library(tidyverse)
library(magrittr)
library(scales)
library(lubridate)
library(zoo)
library(gridExtra)
library(ggridges)
library(gganimate)
library(RCurl)df_videos <- read_csv("videos_watched.csv")
#head(df_videos)## Get a chaacter matrix first
chr_mat = str_split(df_videos$`Topic Code`, "-", n = 5, simplify = TRUE)
## Add headers to character matrix
colnames(chr_mat) <- c("Subject", "Grade", "Curriculum", "ChNo", "TopicNo")
topic_code_info <- as_tibble(chr_mat)
#head(topic_code_info)df_videos %<>%
bind_cols(topic_code_info) %>%
mutate(Grade = as.integer(Grade))
#head(df_videos)df_videos %<>%
filter(! is.na(`Topic Code`))df_quizzes <- read_csv("quiz_scores.csv")
#head(df_quizzes)## Get a character matrix first
chr_mat2 = str_split(df_quizzes$`Chapter Code`, "-", n = 4, simplify = TRUE)
## Add headers to character matrix
colnames(chr_mat2) <- c("Subject", "Grade", "Curriculum", "ChNo")
chapter_code_info <- as_tibble(chr_mat2)
#head(chapter_code_info)df_quizzes %<>%
bind_cols(chapter_code_info) %>%
mutate(Grade = as.integer(Grade))
#head(df_quizzes)df_quizzes %<>%
filter(! is.na(`Chapter Code`))
#dim(df_quizzes)successful_signups <- read_csv("successful_signups.csv")## Parsed with column specification:
## cols(
## Created.Date = col_datetime(format = ""),
## DistinctID = col_character(),
## `Invited?` = col_character(),
## Name = col_character(),
## Phone = col_double(),
## UTM.Source = col_character()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 6 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 1414 Created.Date "date like " २०१९-०३-२४ १३:२२:… 'successful_signups.c… file 2 3960 Created.Date "date like " २०१९-०३-२६ १३:१२:… 'successful_signups.c… row 3 5152 Created.Date "date like " २०१९-०१-०९ १२:४०:… 'successful_signups.c… col 4 13796 Created.Date "date like " २०१९-०२-०६ ०१:३६:… 'successful_signups.c… expected 5 14141 Created.Date "date like " २०१८-१२-१५ १९:३३:… 'successful_signups.c…
## ... ................. ... ........................................................................... ........ ........................................................................... ...... ........................................................................... .... ........................................................................... ... ........................................................................... ... ........................................................................... ........ ...........................................................................
## See problems(...) for more details.
head(successful_signups)au <- read_csv("mau.csv")## Parsed with column specification:
## cols(
## DistinctID = col_character(),
## Name = col_character(),
## Phone = col_double(),
## Email = col_character(),
## BatchCode = col_character(),
## City = col_character(),
## LastActiveDate = col_date(format = ""),
## NumDaysActive = col_integer(),
## NumQuizzes = col_integer(),
## NumVideos = col_integer(),
## NumPDFs = col_integer(),
## PDF_secs = col_double(),
## Video_secs = col_double()
## )
glimpse(au)## Observations: 50,189
## Variables: 13
## $ DistinctID <chr> "015072f1-eba1-4170-a4f3-ac2237e77b9a", "0168955b…
## $ Name <chr> "Santosh keshri", "Manish Kumar", "Nikhil Raghuwa…
## $ Phone <dbl> 919431177166, 917079161488, 919589658477, 9195602…
## $ Email <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ BatchCode <chr> "AGM05", "AGM05", "AGM05", "AGM05", "AGM05", "AGM…
## $ City <chr> "Patna", "Bengaluru", "Bhilai", "New Delhi", NA, …
## $ LastActiveDate <date> 2019-03-30, 2018-08-17, 2018-08-17, 2019-01-15, …
## $ NumDaysActive <int> 15, 1, 1, 2, 4, 3, 2, 3, 1, 4, 1, 1, 2, 2, 12, 5,…
## $ NumQuizzes <int> 0, 0, 0, 3, 4, 3, 1, 1, 0, 1, 0, 0, 0, 0, 8, 3, 1…
## $ NumVideos <int> 0, 0, 1, 3, 5, 2, 3, 5, 2, 0, 0, 1, 6, 1, 13, 4, …
## $ NumPDFs <int> 67, 2, 1, 4, 5, 14, 1, 0, 4, 18, 2, 0, 0, 4, 29, …
## $ PDF_secs <dbl> 2271.424, 41.863, 23.652, 107.438, 899.305, 3141.…
## $ Video_secs <dbl> 0, 0, 1, 7, 25, 7, 255, 167, 40, 0, 0, 8, 13, 58,…
quizlist <- read_csv("CMS Problem Quizzes - 19_Apr_2019.csv")## Warning: Missing column names filled in: 'X11' [11]
## Parsed with column specification:
## cols(
## ID = col_character(),
## Title = col_character(),
## `Parent Type` = col_character(),
## `Parent Name` = col_character(),
## `Parent Code` = col_character(),
## `Source Type` = col_character(),
## `Source Name` = col_character(),
## `Source Code` = col_character(),
## Tag = col_character(),
## `Set ID` = col_character(),
## X11 = col_character()
## )
#head(quizlist)ch_quiz_list <- quizlist %>% filter(`Parent Type` == "Chapter")
top_quiz_list <- quizlist %>% filter(`Parent Type` == "Topic")
num_ch_quizzes <- ch_quiz_list %>% group_by(`Parent Code`) %>% summarise(ch_quiz = n())
num_top_quizzes <- top_quiz_list %>%
mutate(`Parent Code` = str_sub(`Parent Code`, 1, str_length(`Parent Code`)-3)) %>%
group_by(`Parent Code`) %>% summarise(top_quiz = n())
num_top_quizzestot_quizzes <- full_join(num_ch_quizzes, num_top_quizzes, by = "Parent Code")
tot_quizzes %<>%
rowwise() %>%
mutate(num_quiz = sum(ch_quiz, top_quiz, na.rm = TRUE))
tot_quizzeschr_mat3 = str_split(tot_quizzes$`Parent Code`, "-", n = 4, simplify = TRUE)
colnames(chr_mat3) <- c("Subject", "Grade", "Curriculum", "ChNo")
chapter_code_info <- as_tibble(chr_mat3)
tot_quizzes %<>%
bind_cols(chapter_code_info) %>%
mutate(Grade = as.integer(Grade))
tot_quizzesdf_videos %>%
mutate(perc_watched = round(100*`Watch Duration` / `Video Duration`),0) %>%
group_by(Grade, Subject) %>%
summarise(num_vids = n(),
Avg_perc_watched = round(mean(perc_watched, na.rm = TRUE),0)) %>%
ggplot(mapping = aes(x = Grade, y = Subject))+
geom_tile(aes(fill = Avg_perc_watched)) +
scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
midpoint = 30, space = "Lab", na.value = "grey50",
guide = "colourbar", aesthetics = "fill") +
theme_bw() ## Math is super low other than grade nine - what’s different? ## 11/12 watch rate is low in general - do 11/12 care less about videos and more about problems? ## Video content audit - Watch % by chapters
df_videos %>%
mutate(perc_watched = round(100*`Watch Duration` / `Video Duration`),0) %>%
group_by(Grade, Subject, ChNo) %>%
summarise(Avg_perc_watched = round(mean(perc_watched, na.rm = TRUE),0)) %>%
# Num_vids = n()) %>%
ggplot(mapping = aes(x = Subject, y = ChNo))+
geom_tile(aes(fill = Avg_perc_watched)) +
facet_grid(cols = vars(Grade)) +
theme_bw() +
scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
midpoint = 30, space = "Lab", na.value = "grey50",
guide = "colourbar", aesthetics = "fill") ## @Garima - What’s going on with the specific green chapters - for instance 10.M.15 - maybe kids come for revision and then see the last chapter. Ignoring this - 11.C.10 - C12 (mohina popular on youtube) and 11.P.11 - 12 (Ashwin BB Radiation) seem to be doing well. 12.P.17-19 seems surprisingly good. PSV’s have much much higher watch rate than videos (Pritesh add chart) - Also see (Hindi) vs. not (Hindi)
content_audit <- df_videos %>%
mutate(Video.Duration = as.integer(`Video Duration`)) %>%
mutate(perc_watched = round(100*`Watch Duration` / `Video Duration`),0) %>%
group_by(Grade, Subject, ChNo, `Chapter Name`, `Video Name`, Video.Duration) %>%
summarise(Avg_perc_watched = round(mean(perc_watched, na.rm = TRUE),0),
Num_video_opened = n()) ## Warning: NAs introduced by coercion to integer range
content_auditknitr::opts_chunk$set(message = FALSE, warning = FALSE)
p <- df_videos %>%
filter(Grade == 11) %>%
mutate(Month = as.yearmon(`Time Finished`, "%m/%Y")) %>%
group_by(Subject, ChNo, Month) %>%
summarise(num_vids = n()) %>%
#filter(num_vids < 500) %>%
ggplot(mapping = aes(x = Subject, y = ChNo, fill = num_vids))+
geom_tile() +
scale_fill_viridis_c() +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
# facet_grid( shrink = FALSE, space = 'free') +
facet_wrap(vars(Month), ncol = 3) ## facet grid prints all in 1 column
#labs(title = 'Month: {closest_state}') +
#transition_states(Month, transition_length = 3, state_length = 6) +
#ease_aes('linear')
pReplace with % completed
df_quizzes %>%
mutate(Month = as.yearmon(`Time Finished`, "%m/%Y")) %>%
group_by(Grade, Subject, Month) %>%
summarise(num_quizzes = n()) %>%
ggplot(mapping = aes(x = Grade, y = Subject))+
geom_tile(aes(fill = num_quizzes)) +
#scale_fill_viridis_c() +
theme_bw() +
scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
midpoint = 500, space = "Lab", na.value = "grey50",
guide = "colourbar", aesthetics = "fill") ## theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
## panel.background = element_blank(), axis.line = element_line(colour = "black"))
## labs(title = 'Month: {closest_state}') +
## transition_states(Month, transition_length = 3, state_length = 6) +
## ease_aes('linear')df_quizzes %>%
#filter(Grade == 11) %>%
#filter(as.Date(`Time Finished`) > "2018-09-01") %>%
#filter(as.Date(`Time Finished`) < "2018-12-01") %>%
#mutate(Month = as.yearmon(`Time Finished`, "%m/%Y")) %>%
#group_by(Subject, ChNo, Month) %>%
group_by(Grade, Subject, ChNo) %>%
summarise(num_quizzes = n()) %>%
ggplot(mapping = aes(x = Subject, y = ChNo))+
geom_tile(aes(fill = num_quizzes)) +
# scale_fill_viridis_c() +
scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
midpoint = 2000, space = "Lab", na.value = "grey50",
guide = "colourbar", aesthetics = "fill") +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
facet_wrap(vars(Grade), ncol = 4, nrow = 3) ## facet grid prints all in 1 column #labs(title = 'Month: {closest_state}') +
#transition_states(Month, transition_length = 3,state_length = 6) +
#ease_aes('linear')Quiz usage is pretty high for 1st few chapters of 10th and 11th, but pretty low otherwise
mom_videos <- df_videos %>%
mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
DoW = wday(`Time Finished`)
) %>%
group_by(Month) %>%
summarise(Count = n()) %>%
ggplot(aes(x= reorder(as.character(Month), Month), y=Count)) +
geom_bar(stat = "identity", fill = "steelblue", color = "black") +
geom_text(aes(label = Count), hjust = 1, size = 4, color = "white") +
theme_minimal()+
labs(title = "# Videos viewed", x= "Month", y= "Count") +
coord_flip()
### MoM usage - Quizzes
mom_quizzes <- df_quizzes %>%
mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
DoW = wday(`Time Finished`)
) %>%
group_by(Month) %>%
summarise(Count = n()) %>%
ggplot(aes(x= reorder(as.character(Month), Month), y=Count)) +
geom_bar(stat = "identity", fill = "orange", color = "black") +
geom_text(aes(label = Count), hjust = 1, size = 4, color = "black") +
theme_minimal()+
labs(title = "# Quizzes completed", x= "Month", y= "Count") +
coord_flip()
grid.arrange(mom_quizzes, mom_videos, ncol = 2)dow_videos <- df_videos %>%
mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
DoW = wday(`Time Finished`, label = TRUE)
) %>%
group_by(DoW) %>%
summarise(Count = n()) %>%
ggplot(aes(x= DoW, y=Count)) +
geom_bar(stat = "identity", fill = "steelblue", color = "black") +
geom_text(aes(label = Count), hjust = 1, size = 4, color = "white") +
theme_minimal()+
labs(title = "# Videos viewed", x= "Day Of Week", y= "Count") +
coord_flip()
dow_quizzes <- df_quizzes %>%
mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
DoW = wday(`Time Finished`, label = TRUE)
) %>%
group_by(DoW) %>%
summarise(Count = n()) %>%
ggplot(aes(x= DoW, y=Count)) +
geom_bar(stat = "identity", fill = "orange", color = "black") +
geom_text(aes(label = Count), hjust = 1, size = 4, color = "black") +
theme_minimal()+
labs(title = "# Quizzes completed", x= "Day Of Week", y= "Count") +
coord_flip()
grid.arrange(dow_quizzes, dow_videos, ncol = 2)mom_distinct_videos <- df_videos %>%
mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
DoW = wday(`Time Finished`)
) %>%
group_by(Month) %>%
summarise(DistinctUsers = n_distinct(DistinctID)) %>%
ggplot(aes(x= reorder(as.character(Month), Month), y=DistinctUsers)) +
geom_bar(stat = "identity", fill = "steelblue", color = "black") +
geom_text(aes(label = DistinctUsers), hjust = 1, size = 4, color = "white") +
theme_minimal()+
labs(title = "# Distinct Users watching videos", x= "Month", y= "Count") +
coord_flip()
mom_distinct_quizzes <- df_quizzes %>%
mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
DoW = wday(`Time Finished`)
) %>%
group_by(Month) %>%
summarise(DistinctUsers = n_distinct(DistinctID)) %>%
ggplot(aes(x= reorder(as.character(Month), Month), y=DistinctUsers)) +
geom_bar(stat = "identity", fill = "orange", color = "black") +
geom_text(aes(label = DistinctUsers), hjust = 1, size = 4, color = "black") +
theme_minimal()+
labs(title = "# Distinct Users finishing Quizzes", x= "Month", y= "Count") +
coord_flip()
grid.arrange(mom_distinct_quizzes, mom_distinct_videos, ncol = 2)hour_distinct_videos <- df_videos %>%
mutate(Hour = hour(`Time Finished`)) %>%
group_by(Hour) %>%
summarise(DistinctUsers = n_distinct(DistinctID)) %>%
ggplot(aes(x= Hour, y=DistinctUsers)) +
geom_bar(stat = "identity", fill = "steelblue", color = "black") +
geom_text(aes(label = DistinctUsers), hjust = 1, size = 4, color = "white") +
theme_minimal() +
labs(title = "# Distinct Users watching videos", x= "Hour of the day", y= "DistinctUsers") +
coord_flip()
### Hour Of Day usage - Quizzes
hour_distinct_quizzes <- df_quizzes %>%
mutate(Hour = hour(`Time Finished`)) %>%
group_by(Hour) %>%
summarise(DistinctUsers = n_distinct(DistinctID)) %>%
ggplot(aes(x= Hour, y=DistinctUsers)) +
geom_bar(stat = "identity", fill = "orange", color = "black") +
geom_text(aes(label = DistinctUsers), hjust = 1, size = 4, color = "black") +
theme_minimal() +
labs(title = "# Distinct Users giving quizzes", x= "Hour of the day", y= "DistinctUsers") +
coord_flip()
### Hour of the day usage - Quizzes vs Videos
grid.arrange(hour_distinct_quizzes, hour_distinct_videos, ncol = 2)video_life <- df_videos %>%
group_by(DistinctID) %>%
summarise(First_Video = min(`Time Finished`),
Last_Video = max(`Time Finished`))
quiz_life <- df_quizzes %>%
group_by(DistinctID) %>%
summarise(First_Quiz = min(`Time Finished`),
Last_Quiz = max(`Time Finished`))
### Join video and quiz lifetimes
activity_life <- full_join(video_life, quiz_life, by = "DistinctID")
### Common start and end dates for video / quizzes
activity_life %<>%
mutate(First_Video_Quiz = if_else(is.na(First_Video), First_Quiz,
if_else(is.na(First_Quiz), First_Video,
if_else(First_Video < First_Quiz, First_Video, First_Quiz)
)
),
Last_Video_Quiz = if_else(is.na(Last_Video), Last_Quiz,
if_else(is.na(Last_Quiz), Last_Video,
if_else(Last_Video > Last_Quiz, Last_Video, Last_Quiz)
)
)
)
activity_life %<>%
mutate(Age = as.integer(difftime(Last_Video_Quiz, First_Video_Quiz, units = "days")))
#sample_n(activity_life, 10)At what stage / week do they watch video or quiz, etc
all_events <- df_quizzes %>%
select(DistinctID, `Chapter Name`, `Time Finished`) %>%
mutate(Event = 'Quiz')
tmp <- df_videos %>%
select(DistinctID, `Chapter Name`, `Time Finished`) %>%
mutate(Event = 'Video')
all_events %<>%
bind_rows(tmp)
## Calculate the event age based on first activity of the user
all_events %<>%
left_join(select(activity_life, DistinctID, First_Video_Quiz, Last_Video_Quiz, Age),
by = "DistinctID") %>%
mutate(Day_No = as.integer(difftime(`Time Finished`, First_Video_Quiz, units = "days"))) %>%
mutate(Days_to_Die = as.integer(difftime(Last_Video_Quiz, `Time Finished`, units = "days")))
## Add the activity count serially per user
all_events %<>%
group_by(DistinctID) %>%
arrange(`Time Finished`, .by_group=TRUE) %>%
mutate(Event_No = row_number())activity_life %>%
filter(Age > 0) %>%
ggplot(aes(x = Age)) +
geom_histogram(binwidth = 1) +
coord_cartesian(xlim = c(-10,100)) +
geom_vline(aes( xintercept = median(Age, na.rm = TRUE)))Vertical line indicates median age, ~15 days or 2 weeks.
activity_life %>%
mutate(Start_Month = as.yearmon(First_Video_Quiz, "%m/%Y")) %>%
filter(Age > 0) %>%
ggplot(aes(x = Age, y = reorder(as.character(Start_Month), Start_Month))) +
geom_density_ridges() +
coord_cartesian(xlim = c(0, 30)) +
labs(title = "Age distribution of app users by monthly cohorts", y = "Month", x = "Age (# days from first video / quiz to last video / quiz)")quiz_ridge <- quiz_life %>%
mutate(Start_Month = as.yearmon(First_Quiz, "%m/%Y")) %>%
mutate(Age = as.integer(difftime(Last_Quiz, First_Quiz, units = "days"))) %>%
filter(Age > 0) %>%
ggplot(aes(x = Age, y = reorder(as.character(Start_Month), Start_Month))) +
geom_density_ridges() +
coord_cartesian(xlim = c(0, 365)) +
labs(title = "Age distribution of quiz users", y = "Month", x = "Age (# days from first quiz to last quiz)")
quiz_ridgevideo_ridge <- video_life %>%
mutate(Start_Month = as.yearmon(First_Video, "%m/%Y")) %>%
filter(as.Date(First_Video) > as.Date("2018-04-30")) %>%
mutate(Age = as.integer(difftime(Last_Video, First_Video, units = "days"))) %>%
filter(Age > 0) %>%
ggplot(aes(x = Age, y = reorder(as.character(Start_Month), Start_Month))) +
geom_density_ridges() +
coord_cartesian(xlim = c(0, 365)) +
labs(title = "Age distribution of video users", y = "Month", x = "Age (# days from first video to last video)")
video_ridgedead_users <- activity_life %>%
filter(Last_Video_Quiz < as.Date("2018-10-31"))
dead_user_events <- all_events %>%
filter(Last_Video_Quiz < as.Date("2018-10-31"))dead_users %<>%
left_join(df_videos, by = c("DistinctID"="DistinctID", "Last_Video"="Time Finished")) %>%
select(-`Video Duration`, -`Topic Code`, -`Video Name`, -`Watch Duration`, -Subject,
-Grade, -ChNo, -TopicNo, -Curriculum, -`Chapter Name`)
### Let's check the credit balance of dead users who used the app for atleast 2 days
dead_users %>%
filter(Age > 1) %>%
# ggplot(aes(x=log10(1+`Credits Balance`)))+
ggplot(aes(x=`Credits Balance`))+
geom_histogram(binwidth = 10) +
coord_cartesian(xlim = c(-20,400)) +
geom_vline(aes( xintercept = median(`Credits Balance`, na.rm = TRUE)))Let’s see what the dead users did in their last 7 days We should check if the video watch rates are low OR the quiz scores are low
#all_events %>%
dead_user_events %>%
filter(Age > 60) %>%
filter(Days_to_Die < 8) %>%
group_by(Days_to_Die, Event) %>%
summarise(count = n()) %>%
#spread(key = Event, value = count) %>%
ggplot() +
geom_line(aes(x = Days_to_Die, y = count, colour = Event)) More than usual quizzes and videos were watched on the last day !! This points to intermittent usage by the dead users. Hence, we do not see a lot of daily activity for them. Let’s plot this weekly
#all_events %>%
dead_user_events %>%
filter(Age > 200) %>%
#filter(Days_to_Die < 8) %>%
mutate(Weeks_to_Die = as.integer(Days_to_Die / 7)) %>%
group_by(Weeks_to_Die, Event) %>%
summarise(count = n()) %>%
#spread(key = Event, value = count) %>%
ggplot() +
geom_line(aes(x = Weeks_to_Die, y = count, colour = Event))This makes more sense ! There is a drop in user activity before they use the app for the last time
Let’s compare with the first 7 days
all_events %>%
filter(Age > 100) %>%
filter(Day_No < 100) %>%
group_by(Day_No, Event) %>%
summarise(count = n()) %>%
#spread(key = Event, value = count) %>%
ggplot() +
geom_line(aes(x = Day_No, y = count, colour = Event))Vertical line indicates the median no of credits for dead users, which is ~ 20
df_videos %>%
mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
ggplot(aes(x = perc_watched)) +
geom_histogram(binwidth = 5) +
geom_vline(aes( xintercept = mean(perc_watched))) +
# geom_text(label = mean(perc_watched)) +
coord_cartesian(xlim = c(-5, 150))At event level, mean watch rate is 28% (At video level,we have some videos with > 100% median watch rate. Mean is ~10% ) Compare with YouTube watch rates
Avg watch duration for our videos, in secs, is ~13 secs
df_videos %>%
ggplot(aes(as.integer(`Watch Duration`)))+
geom_histogram(binwidth = 50) +
geom_vline(aes( xintercept = median(`Watch Duration`))) +
# geom_text(aes(label = mean(Median_Watch_Rate)))+
coord_cartesian(xlim = c(0, 500))df_videos %>%
mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
group_by(Subject, Grade, ChNo, `Video Name`) %>%
summarise(Median_Watch_Rate = round(median(perc_watched, na.rm = TRUE), 0)) %>%
arrange(desc(Median_Watch_Rate)) Are students watching videos sequantially ? or specifically something for revision ? We already saw students usually begin from Chapter 1 always
df_videos %>%
left_join(video_life, by = "DistinctID") %>%
mutate(Video_Age = as.integer(difftime(`Time Finished`, First_Video, units = "days"))) %>%
filter(Grade == 11) %>%
ggplot(aes(y = Video_Age, x = ChNo))+
geom_boxplot()Most students reach the end of the cuuriculum within 2-3 months from their start date
Let’s check if % video watched decays over time
boxplot_watch_rate <- df_videos %>%
mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
left_join(video_life, by = "DistinctID") %>%
mutate(Event_Age = as.integer(difftime(`Time Finished`, First_Video, units = "days"))) %>%
mutate(Video_Age = as.integer(difftime(Last_Video, First_Video, units = "days"))) %>%
mutate(Week_No = as.integer(Event_Age / 7)) %>%
filter(Video_Age > 60) %>%
ggplot(aes(x = Week_No, y = perc_watched, group = Week_No))+
geom_boxplot(notch=FALSE, fill="blue", alpha=0.2, outlier.size = 0.2) +
coord_cartesian(ylim=c(0, 300)) ## Zoom Without removing data
boxplot_watch_rateThere is no decay, in general, of watch %age over time. In fact, the longer the students stay, they tend to watch videos for longer.
Let’s try a lineplot for each distinct user for their video watch rate
lineplot_watch_rate <- df_videos %>%
mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
left_join(video_life, by = "DistinctID") %>%
mutate(Event_Age = as.integer(difftime(`Time Finished`, First_Video, units = "days"))) %>%
mutate(Video_Age = as.integer(difftime(Last_Video, First_Video, units = "days"))) %>%
mutate(Week_No = as.integer(Event_Age / 7)) %>%
filter(Video_Age > 200) %>%
ggplot(aes(x = Week_No, y = perc_watched, colour = DistinctID)) +
geom_line() +
theme(legend.position = "none") +
coord_cartesian(ylim=c(0, 300)) ## Zoom Without removing data
lineplot_watch_rateWild variation between successive videos being watched
Let’s check the boxplot but only for dead users.
df_videos %>%
semi_join(dead_users, by = "DistinctID") %>%
mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
left_join(video_life, by = "DistinctID") %>%
mutate(Video_Age = as.integer(difftime(`Time Finished`, First_Video, units = "days"))) %>%
mutate(Week_No = as.integer(Video_Age / 7)) %>%
ggplot(aes(x = Week_No, y = perc_watched, group = Week_No))+
geom_boxplot(notch=FALSE, fill="blue", alpha=0.2, outlier.size = 0.2) +
coord_cartesian(ylim=c(0, 300)) ## Zoom Without removing dataWe can see a decay in the outlier watch % over weeks, but the median doesn’t change much. So students dropping off after getting bored of videos doesn’t seem likely.
Are students attempting quizzes sequentially ?
df_quizzes %>%
left_join(quiz_life, by = "DistinctID") %>%
mutate(Quiz_Age = as.integer(difftime(`Time Finished`, First_Quiz, units = "days"))) %>%
filter(Grade == 11) %>%
ggplot(aes(y = Quiz_Age, x = ChNo))+
geom_boxplot()Most students do attempt quizzes sequentially only.
all_events %>%
# filer(Age > 60) %>%
group_by(DistinctID, Day_No) %>%
summarise(Total_Videos = sum(Event == 'Video'),
Total_Quizzes = sum(Event == 'Quiz')) %>%
ggplot(aes(x = Total_Videos, y = Total_Quizzes)) +
geom_point(aes(alpha = 0.001)) +
theme_bw() +
coord_cartesian(xlim = c(-1,100), ylim = c(-1,100)) ## labs(title = '# Days since first activity : {closest_state}') +
## transition_states(Day_No, transition_length = 3, state_length = 6) +
## ease_aes('linear') all_events %>%
mutate(Week_No = as.integer(Day_No / 7)) %>%
filter(Week_No < 16) %>%
group_by(DistinctID, Week_No) %>%
summarise(Total_Videos = sum(Event == 'Video'),
Total_Quizzes = sum(Event == 'Quiz')) %>%
ggplot(aes(x = Total_Videos, y = Total_Quizzes)) +
geom_point(aes(alpha = 0.001)) +
theme_bw() +
coord_cartesian(xlim = c(-1,100), ylim = c(-1,100)) +
facet_wrap(vars(Week_No), nrow = 4, ncol = 4)Not much difference from Weeks 3 to 7 !
Let’s check mean no of quizzes and videos per week
all_events %>%
## filter(Age > 200) %>%
mutate(Week_No = as.integer(Day_No / 7)) %>%
#filter(Week_No < 10) %>%
group_by(Week_No, DistinctID) %>%
summarise(Num_Videos = sum(Event == 'Video'),
Num_Quizzes = sum(Event == 'Quiz')) %>%
group_by(Week_No) %>%
summarise(Mean_Videos = round(mean(Num_Videos, na.rm = TRUE),0),
Mean_Quizzes = round(mean(Num_Quizzes, na.rm = TRUE),0))Let’s plot this using boxplot to see the variation in no of events per week
all_events %>%
## filter(Age > 200) %>%
mutate(Week_No = as.integer(Day_No / 7)) %>%
#filter(Week_No < 10) %>%
group_by(Week_No, DistinctID) %>%
summarise(Num_Videos = sum(Event == 'Video'),
Num_Quizzes = sum(Event == 'Quiz')) %>%
ungroup() %>%
#filter(Week_No < 30) %>%
group_by(Week_No) %>%
ggplot(aes(x = Week_No, y = Num_Videos, group = Week_No))+
geom_boxplot(notch=FALSE, fill="red", alpha=0.2, outlier.size = 0.2) +
coord_cartesian(ylim=c(0, 25)) ## Without removing data, just zoomWe see that the median no of videos are only 2-3 per week, but the outliers are pulling the mean upto 5. After Week 30, the sample size is pretty small so the outliers are pulling the median up.
Let’s check box plot for quizzes
all_events %>%
#filter(Age > 200) %>%
mutate(Week_No = as.integer(Day_No / 7)) %>%
#filter(Week_No < 10) %>%
group_by(Week_No, DistinctID) %>%
summarise(Num_Videos = sum(Event == 'Video'),
Num_Quizzes = sum(Event == 'Quiz')) %>%
ungroup() %>%
#filter(Week_No < 30) %>%
group_by(Week_No) %>%
ggplot(aes(x = Week_No, y = Num_Quizzes, group = Week_No))+
geom_boxplot(notch=FALSE, fill="blue", alpha=0.2, outlier.size = 0.2) +
coord_cartesian(ylim=c(0, 5)) ## Without removing data, just zoomMost students attempt 0-1 (median 0, men 1) quizzes per week, and there seems to be no variation by week, except the decay of outliers. Makes sense though, as each topic has roughly 4-5 videos and only 1 quiz.
weekly_user_activity <- all_events %>%
filter(Age > 300) %>%
mutate(Week_No = as.integer(Day_No / 7)) %>%
#filter(Week_No < 10) %>%
group_by(Week_No, DistinctID) %>%
summarise(Activity_Level = as.integer(n() / 10),
IsActive = if_else(n() > 10, 1, 0),
Total_Activities_Till_Date = max(Event_No)) %>%
ungroup()
weekly_user_activity %>%
#mutate(Cumulative_Activities = gr)
ggplot(aes(x = Week_No, y = IsActive, colour = DistinctID, alpha = 0.1)) +
geom_line() +
theme(legend.position = "none") Activity states change very rapidly week on week. Let’s check their cumulative activity sum by week
all_events %>%
filter(Age > 300) %>%
filter(Day_No < 30) %>%
mutate(Week_No = as.integer(Day_No / 7)) %>%
#filter(Week_No < 10) %>%
group_by(Week_No, DistinctID) %>%
summarise(Activity_Level = as.integer(n() / 10),
IsActive = if_else(n() > 10, 1, 0),
Total_Activities_Till_Date = max(Event_No)) %>%
ggplot(aes(x = Week_No, y = Total_Activities_Till_Date, colour = DistinctID, alpha = 0.1)) +
geom_line() +
theme(legend.position = "none")Looking at cumulative activity plots for just the videos with a cutoff for qualifying as an event
df_videos %>%
mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
filter(perc_watched > 90) %>%
filter(`Watch Duration` > 20) %>%
left_join(all_events, by = c("DistinctID", "Time Finished")) %>%
# mutate(Week_No = as.integer(Day_No / 7)) %>%
filter(Day_No < 30) %>%
group_by(Day_No, DistinctID) %>%
summarise(Activity_Level = as.integer(n() / 10),
IsActive = if_else(n() > 10, 1, 0),
Total_Activities_Till_Date = max(Event_No)) %>%
ggplot(aes(x = Day_No, y = Total_Activities_Till_Date, colour = DistinctID, alpha = 0.1)) +
geom_line() +
theme(legend.position = "none")Let’s check this graph by day
all_events %>%
filter(Age > 300) %>%
# mutate(Week_No = as.integer(Day_No / 7)) %>%
group_by(Day_No, DistinctID) %>%
summarise(Total_Activities_Till_Date = max(Event_No)) %>%
ggplot(aes(x = Day_No, y = Total_Activities_Till_Date, colour = DistinctID, alpha = 0.1)) +
geom_line() +
theme(legend.position = "none") Some users stay dormant for a long time and come back after 3-6 months.
Let’s check this separately for videos and quizzes
cumulative_video_plot <- all_events %>%
filter(Age > 250) %>%
filter(Event == "Video") %>%
# mutate(Week_No = as.integer(Day_No / 7)) %>%
group_by(Day_No, DistinctID) %>%
summarise(Total_Activities_Till_Date = max(Event_No)) %>%
ggplot(aes(x = Day_No, y = Total_Activities_Till_Date, colour = DistinctID, alpha = 0.1)) +
geom_line() +
ggtitle("Only Videos") +
theme(legend.position = "none")
cumulative_quiz_plot <- all_events %>%
filter(Age > 250) %>%
filter(Event == "Quiz") %>%
# mutate(Week_No = as.integer(Day_No / 7)) %>%
group_by(Day_No, DistinctID) %>%
summarise(Total_Activities_Till_Date = max(Event_No)) %>%
ggplot(aes(x = Day_No, y = Total_Activities_Till_Date, colour = DistinctID, alpha = 0.1)) +
geom_line() +
ggtitle("Only Quizzes") +
theme(legend.position = "none")
grid.arrange(cumulative_video_plot, cumulative_quiz_plot)quiz_usage <- df_quizzes %>%
mutate(perc_score = round((100* `Correct Questions` / `Total Questions`),0)) %>%
group_by(DistinctID, `Chapter Code`) %>%
summarise(finished = n(),
avg_score = mean(perc_score, na.rm = TRUE)) %>%
left_join(tot_quizzes, by = c("Chapter Code" = "Parent Code")) %>% #Total quizzes available chapter
mutate(usage = round((100* finished / num_quiz),0))
sample_frac(quiz_usage)Were the quizzes too easy ? Avg score > 50%
df_quizzes %>%
mutate(perc_score = round((100* `Correct Questions` / `Total Questions`),0)) %>%
group_by(Grade, Subject, ChNo) %>%
summarise(num_users = n_distinct(DistinctID),
Avg_Score = mean(perc_score, na.rm = TRUE)
) %>%
ggplot(mapping = aes(x = Subject, y = ChNo))+
geom_tile(aes(fill = Avg_Score)) +
facet_grid(cols = vars(Grade)) +
theme_bw() +
scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
midpoint = 50, space = "Lab", na.value = "grey50",
guide = "colourbar", aesthetics = "fill")Currently not accurate as multiple events can be logged for the same quiz and same user
quiz_usage %>%
group_by(Grade, Subject, ChNo) %>%
summarise(Median_perc_quizzes_completed = median(usage, na.rm = TRUE)
) %>%
ggplot(mapping = aes(x = Subject, y = ChNo))+
geom_tile(aes(fill = Median_perc_quizzes_completed)) +
facet_grid(cols = vars(Grade)) +
theme_bw() +
scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
midpoint = 50, space = "Lab", na.value = "grey50",
guide = "colourbar", aesthetics = "fill")Let’s check distribution of users by their % quiz completed per chapter
quiz_usage %>%
group_by(DistinctID) %>%
summarise(Mean_perc_quizzes_completed_per_chapter = mean(usage, na.rm = TRUE),
num_chapters_attempted = n()) %>%
filter(num_chapters_attempted > 10) %>%
ggplot() +
geom_histogram(aes(x=Mean_perc_quizzes_completed_per_chapter), binwidth = 5) +
geom_vline(aes( xintercept = mean(Mean_perc_quizzes_completed_per_chapter))) +
# geom_text
coord_cartesian(xlim=c(-5,120))au_signups <- left_join(successful_signups, au, by = "DistinctID")
sample_n(au_signups, 20)au_signups %>%
group_by(UTM.Source) %>%
summarise(Count = n(),
Avg_Videos_Watched = mean(NumVideos, na.rm = TRUE),
Avg_Num_Days_Active = mean(NumDaysActive, na.rm = TRUE),
Avg_Quizzes_Completed = mean(NumQuizzes, na.rm = TRUE)) %>%
arrange(desc(Count)) #filter(Count > 200) %>%
#filter(! UTM.Source %in% c("NA", "(not set)")) %>%
#ggplot() +
#geom_barplot(aes(x = UTM.Source))p1 <- au_signups %>%
filter( UTM.Source %in% c("google-play", "YouTube", "avanti.in", "Google Form", "Print Media")) %>%
ggplot() +
geom_boxplot(aes(x = UTM.Source, y = NumQuizzes)) ## change y to NumVideos or NumDaysActive
p2 <- au_signups %>%
filter( UTM.Source %in% c("google-play", "YouTube", "avanti.in", "Google Form", "Print Media")) %>%
ggplot() +
geom_boxplot(aes(x = UTM.Source, y = NumVideos)) ## change y to NumVideos or NumDaysActive
p3 <- au_signups %>%
filter( UTM.Source %in% c("google-play", "YouTube", "avanti.in", "Google Form", "Print Media")) %>%
ggplot() +
geom_boxplot(aes(x = UTM.Source, y = NumDaysActive)) ## change y to NumVideos or NumDaysActive
grid.arrange(p1, p2, p3, nrow = 3)There is no major difference between different students from different campaigns !
## Time to run out of credits - Credit Balance decay
## Left NOT due to credit balance
## Not enough quiz questions ? - # of chapters with > 50% quizzes attempted
## Who leaves in Wk1 ?
## Who leaves in Wk2 ?
## Who leaves in Wk3 ?
## Who leaves in Wk4 ?incomplete_quizzes <- read_csv("incomplete_quizzes.csv")
glimpse(incomplete_quizzes)## Observations: 9,814
## Variables: 12
## $ `Chapter Code` <chr> "CHM-11-JEE-02", "CHM-11-JEE-02", "CHM-11-JE…
## $ `Chapter Name` <chr> "Atomic Structure", "Atomic Structure", "Per…
## $ `Correct Questions` <int> 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 0, 0, 1,…
## $ `Credits Balance` <int> 20, 20, 40, 40, 40, 20, 40, 10, 12, 20, 40, …
## $ DistinctID <chr> "bddbbc80-d105-42a5-a15d-e64db5133ad1", "bdd…
## $ Name <chr> "Prachi", "Prachi", "Prachi", "Prachi", "Ram…
## $ `Quiz Name` <chr> "Practice Quiz 1", "Practice Quiz 2", "Early…
## $ Subject <chr> "Chemistry", "Chemistry", "Chemistry", "Chem…
## $ `Time Finished` <dttm> 2018-07-28 14:27:21, 2018-07-28 14:29:19, 2…
## $ `Time Spent(s)` <int> 115, 200, 271, 252, 126, 52, 58, 31, 168, 98…
## $ `Total Questions` <int> 5, 5, 7, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 4,…
## $ UserID <chr> "2013Prachipant@gmail.com", "2013Prachipant@…
Anti join with the quiz_scores by DistinctID, Chapter Code & Quiz Name
incomplete_quiz_only <- anti_join(incomplete_quizzes, df_quizzes,
by = c("DistinctID", "Chapter Code", "Quiz Name"))
#sample_n(incomplete_quiz_only, 20)chr_mat4 = str_split(incomplete_quiz_only$`Chapter Code`, "-", n = 4, simplify = TRUE)
colnames(chr_mat4) <- c("Subject", "Grade", "Curriculum", "ChNo")
chapter_code_info <- as_tibble(chr_mat4)
incomplete_quiz_only %<>%
bind_cols(chapter_code_info) %>%
mutate(Grade = as.integer(Grade))
incomplete_quiz_only %>%
mutate(perc_score = round((100* `Correct Questions` / `Total Questions`),0)) %>%
group_by(Grade, Subject, ChNo) %>%
summarise(num_users = n_distinct(DistinctID),
Avg_Score = mean(perc_score, na.rm = TRUE)
) %>%
ggplot(mapping = aes(x = Subject, y = ChNo))+
geom_tile(aes(fill = Avg_Score)) +
facet_grid(cols = vars(Grade)) +
theme_bw() +
scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
midpoint = 50, space = "Lab", na.value = "grey50",
guide = "colourbar", aesthetics = "fill")Clearly, students who do not complete quizzes do so due to poor scores on Questions attempted thus far. But, only 6413 out of 52456 remain unfinished in the last 1 year.
quizzes <- read.delim(textConnection(getURL(
"https://avanti-dump.s3-ap-southeast-1.amazonaws.com/quiz-resp-dumps/quizzes.tsv")),
header = FALSE, stringsAsFactors = FALSE)
colnames(quizzes) <- c("LMSTestID", "CMSQuizID", "ParentType", "ParentID", "QuizName", "Tags")
sample_n(quizzes, 10)We have 4 kinds of chapter quizzes and then some topic quizzes. Let’s add quiz type
quizzes %<>% mutate(Quiz_Type = if_else(ParentType == "Topic", "TopicQuiz", Tags))
sample_n(quizzes, 10)quiz_responses <- read.delim(textConnection(
getURL("https://avanti-dump.s3-ap-southeast-1.amazonaws.com/quiz-resp-dumps/responses.tsv")),
header = FALSE, stringsAsFactors = FALSE)
colnames(quiz_responses) <- c("Timestamp","LMSTestID","StudentID", "ProblemID", "Response")
sample_n(quiz_responses, 10)response_count <- quiz_responses %>% group_by(LMSTestID, Response) %>% summarise(Count = n()) %>% spread(Response, Count)
quizzes %<>% left_join(response_count, by = "LMSTestID")
sample_n(quizzes, 10)stack_bar_plot <- quizzes %>%
filter(! Quiz_Type == "TopicQuiz") %>%
gather(Correct, Incorrect, `Not Visited`, Skipped, Attempted, key = "Response", value = "ProblemCounts") %>%
group_by(Quiz_Type, Response) %>%
summarise(student_problem_counts = sum(ProblemCounts, na.rm = TRUE)) %>%
ggplot(aes(x = Quiz_Type, y = student_problem_counts, fill = Response)) +
geom_bar(stat = "identity")
stack_bar_plotperc_stacked_bar_plt <- quizzes %>%
filter(! Quiz_Type == "TopicQuiz") %>%
gather(Correct, Incorrect, `Not Visited`, Skipped, Attempted, key = "Response", value = "ProblemCounts") %>%
group_by(Quiz_Type, Response) %>%
summarise(student_problem_counts = sum(ProblemCounts, na.rm = TRUE)) %>%
ggplot(aes(x = Quiz_Type, y = student_problem_counts, fill = Response)) +
geom_bar(position = "fill", stat = "identity") +
scale_y_continuous(labels = scales::percent_format())
perc_stacked_bar_pltalc <- read_csv("active_users_since.csv") ## Active users since April-1-2019
head(alc)dup <- alc %>%
filter(! is.na(`Centre Name`)) %>%
filter( `Centre Name` %in% c("Avanti Gurukul Mobile App", "Online Students - Pilot",
"Online Students - Paid", "Online JEE Test Series", "Demo Centre 2017")) alc %<>%
filter(! is.na(`Centre Name`)) %>%
filter(! `Centre Name` %in% c("Avanti Gurukul Mobile App", "Online Students - Pilot",
"Online Students - Paid", "Online JEE Test Series", "Demo Centre 2017"))
alc %>%
group_by(`Centre Name`) %>%
summarise(n = n()) %>%
filter(n > 4) %>%
ggplot(aes(x=reorder(`Centre Name`, n), y=n)) +
geom_bar(stat = "identity", fill = "orange", color = "black") +
geom_text(aes(label = n), hjust = 1, size = 4, color = "black") +
theme_minimal()+
labs(title = "Centre-wise distribution (10+ students)", x= "Centre", y= "Count") +
coord_flip()Match dup with studlist
studlist <- read.delim(
textConnection(
getURL("https://s3-ap-southeast-1.amazonaws.com/avanti-dump/dashboard-dumps/studentlist/student_list.tsv")),
header = TRUE, stringsAsFactors = FALSE)
dup %<>%
mutate(Phone = as.character(Phone))
ph_no_match <- inner_join(dup, studlist, by = c("Phone" = "Phone"))
parent_ph_no_match <- inner_join(dup, studlist, by = c("Phone" = "Parent.s.Phone"))