library(tidyverse)
library(magrittr)
library(scales)
library(lubridate)
library(zoo)
library(gridExtra)
library(ggridges)
library(gganimate)
library(RCurl)

Reading and cleaning data

Read mixpanel output

df_videos <- read_csv("videos_watched.csv")
#head(df_videos)

Extracting information from TopicCode

## Get a chaacter matrix first
chr_mat = str_split(df_videos$`Topic Code`, "-", n = 5, simplify = TRUE)
## Add headers to character matrix
colnames(chr_mat) <- c("Subject", "Grade", "Curriculum", "ChNo", "TopicNo")
topic_code_info <- as_tibble(chr_mat)
#head(topic_code_info)

Add curriculum info to df

df_videos %<>%
  bind_cols(topic_code_info) %>%
  mutate(Grade = as.integer(Grade))

#head(df_videos)

Filter rows without TopicCode info

df_videos %<>%
  filter(! is.na(`Topic Code`))

Quiz usage - Read and clean quiz data

df_quizzes <- read_csv("quiz_scores.csv")
#head(df_quizzes)

Extracting information from ChapterCode

## Get a character matrix first
chr_mat2 = str_split(df_quizzes$`Chapter Code`, "-", n = 4, simplify = TRUE)
## Add headers to character matrix
colnames(chr_mat2) <- c("Subject", "Grade", "Curriculum", "ChNo")
chapter_code_info <- as_tibble(chr_mat2)
#head(chapter_code_info)

Add curriculum info to df_quizzes

df_quizzes %<>%
  bind_cols(chapter_code_info) %>%
  mutate(Grade = as.integer(Grade))

#head(df_quizzes)

Fitering out rows without chapter code

df_quizzes %<>%
  filter(! is.na(`Chapter Code`))

#dim(df_quizzes)

No of successful signups last year

successful_signups <- read_csv("successful_signups.csv")
## Parsed with column specification:
## cols(
##   Created.Date = col_datetime(format = ""),
##   DistinctID = col_character(),
##   `Invited?` = col_character(),
##   Name = col_character(),
##   Phone = col_double(),
##   UTM.Source = col_character()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 6 parsing failures.
## row # A tibble: 5 x 5 col     row col          expected     actual             file                   expected   <int> <chr>        <chr>        <chr>              <chr>                  actual 1  1414 Created.Date "date like " २०१९-०३-२४ १३:२२:… 'successful_signups.c… file 2  3960 Created.Date "date like " २०१९-०३-२६ १३:१२:… 'successful_signups.c… row 3  5152 Created.Date "date like " २०१९-०१-०९ १२:४०:… 'successful_signups.c… col 4 13796 Created.Date "date like " २०१९-०२-०६ ०१:३६:… 'successful_signups.c… expected 5 14141 Created.Date "date like " २०१८-१२-१५ १९:३३:… 'successful_signups.c…
## ... ................. ... ........................................................................... ........ ........................................................................... ...... ........................................................................... .... ........................................................................... ... ........................................................................... ... ........................................................................... ........ ...........................................................................
## See problems(...) for more details.
head(successful_signups)

Active users last year

au <- read_csv("mau.csv")
## Parsed with column specification:
## cols(
##   DistinctID = col_character(),
##   Name = col_character(),
##   Phone = col_double(),
##   Email = col_character(),
##   BatchCode = col_character(),
##   City = col_character(),
##   LastActiveDate = col_date(format = ""),
##   NumDaysActive = col_integer(),
##   NumQuizzes = col_integer(),
##   NumVideos = col_integer(),
##   NumPDFs = col_integer(),
##   PDF_secs = col_double(),
##   Video_secs = col_double()
## )
glimpse(au)
## Observations: 50,189
## Variables: 13
## $ DistinctID     <chr> "015072f1-eba1-4170-a4f3-ac2237e77b9a", "0168955b…
## $ Name           <chr> "Santosh keshri", "Manish Kumar", "Nikhil Raghuwa…
## $ Phone          <dbl> 919431177166, 917079161488, 919589658477, 9195602…
## $ Email          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ BatchCode      <chr> "AGM05", "AGM05", "AGM05", "AGM05", "AGM05", "AGM…
## $ City           <chr> "Patna", "Bengaluru", "Bhilai", "New Delhi", NA, …
## $ LastActiveDate <date> 2019-03-30, 2018-08-17, 2018-08-17, 2019-01-15, …
## $ NumDaysActive  <int> 15, 1, 1, 2, 4, 3, 2, 3, 1, 4, 1, 1, 2, 2, 12, 5,…
## $ NumQuizzes     <int> 0, 0, 0, 3, 4, 3, 1, 1, 0, 1, 0, 0, 0, 0, 8, 3, 1…
## $ NumVideos      <int> 0, 0, 1, 3, 5, 2, 3, 5, 2, 0, 0, 1, 6, 1, 13, 4, …
## $ NumPDFs        <int> 67, 2, 1, 4, 5, 14, 1, 0, 4, 18, 2, 0, 0, 4, 29, …
## $ PDF_secs       <dbl> 2271.424, 41.863, 23.652, 107.438, 899.305, 3141.…
## $ Video_secs     <dbl> 0, 0, 1, 7, 25, 7, 255, 167, 40, 0, 0, 8, 13, 58,…

Number of Chapter quizzes and Topic quizzes

quizlist <- read_csv("CMS Problem Quizzes - 19_Apr_2019.csv")
## Warning: Missing column names filled in: 'X11' [11]
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   Title = col_character(),
##   `Parent Type` = col_character(),
##   `Parent Name` = col_character(),
##   `Parent Code` = col_character(),
##   `Source Type` = col_character(),
##   `Source Name` = col_character(),
##   `Source Code` = col_character(),
##   Tag = col_character(),
##   `Set ID` = col_character(),
##   X11 = col_character()
## )
#head(quizlist)

Let’s see if the students are running out of questions in the quizzes

ch_quiz_list <- quizlist %>% filter(`Parent Type` == "Chapter")
top_quiz_list <- quizlist %>% filter(`Parent Type` == "Topic")

num_ch_quizzes <- ch_quiz_list %>% group_by(`Parent Code`) %>% summarise(ch_quiz = n())
num_top_quizzes <- top_quiz_list %>% 
  mutate(`Parent Code` = str_sub(`Parent Code`, 1, str_length(`Parent Code`)-3)) %>%
  group_by(`Parent Code`) %>% summarise(top_quiz = n())

num_top_quizzes
tot_quizzes <- full_join(num_ch_quizzes, num_top_quizzes, by = "Parent Code")
tot_quizzes %<>%
  rowwise() %>%
  mutate(num_quiz = sum(ch_quiz, top_quiz, na.rm = TRUE))

tot_quizzes

Extracting information from ChapterCode

chr_mat3 = str_split(tot_quizzes$`Parent Code`, "-", n = 4, simplify = TRUE)
colnames(chr_mat3) <- c("Subject", "Grade", "Curriculum", "ChNo")
chapter_code_info <- as_tibble(chr_mat3)
tot_quizzes %<>%
  bind_cols(chapter_code_info) %>%
  mutate(Grade = as.integer(Grade))

tot_quizzes

Video watch rate by grade and subject

df_videos %>%
  mutate(perc_watched = round(100*`Watch Duration` / `Video Duration`),0) %>%
  group_by(Grade, Subject) %>%
  summarise(num_vids =  n(),
            Avg_perc_watched = round(mean(perc_watched, na.rm = TRUE),0)) %>%
  ggplot(mapping = aes(x = Grade, y = Subject))+
  geom_tile(aes(fill = Avg_perc_watched)) +
  scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
                         midpoint = 30, space = "Lab", na.value = "grey50", 
                         guide = "colourbar", aesthetics = "fill") +
  theme_bw()

## Math is super low other than grade nine - what’s different? ## 11/12 watch rate is low in general - do 11/12 care less about videos and more about problems? ## Video content audit - Watch % by chapters

df_videos %>%
  mutate(perc_watched = round(100*`Watch Duration` / `Video Duration`),0) %>%
  group_by(Grade, Subject, ChNo) %>%
  summarise(Avg_perc_watched = round(mean(perc_watched, na.rm = TRUE),0)) %>%
         #   Num_vids = n()) %>%
  ggplot(mapping = aes(x = Subject, y = ChNo))+
  geom_tile(aes(fill = Avg_perc_watched)) +
  facet_grid(cols = vars(Grade)) +
  theme_bw() +
  scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
                         midpoint = 30, space = "Lab", na.value = "grey50", 
                         guide = "colourbar", aesthetics = "fill")

## @Garima - What’s going on with the specific green chapters - for instance 10.M.15 - maybe kids come for revision and then see the last chapter. Ignoring this - 11.C.10 - C12 (mohina popular on youtube) and 11.P.11 - 12 (Ashwin BB Radiation) seem to be doing well. 12.P.17-19 seems surprisingly good. PSV’s have much much higher watch rate than videos (Pritesh add chart) - Also see (Hindi) vs. not (Hindi)

content_audit <- df_videos %>%
    mutate(Video.Duration = as.integer(`Video Duration`)) %>%
  mutate(perc_watched = round(100*`Watch Duration` / `Video Duration`),0) %>%
  group_by(Grade, Subject, ChNo, `Chapter Name`, `Video Name`, Video.Duration) %>%
  summarise(Avg_perc_watched = round(mean(perc_watched, na.rm = TRUE),0),
            Num_video_opened = n()) 
## Warning: NAs introduced by coercion to integer range
content_audit

Video usage by chapters across months - Fix scale to add more granularity below 500 num vids

knitr::opts_chunk$set(message = FALSE, warning = FALSE)

p <- df_videos %>%
  filter(Grade == 11) %>%
  mutate(Month = as.yearmon(`Time Finished`, "%m/%Y")) %>%
  group_by(Subject, ChNo, Month) %>%
  summarise(num_vids =  n()) %>%
  #filter(num_vids < 500) %>%
  ggplot(mapping = aes(x = Subject, y = ChNo, fill = num_vids))+
  geom_tile() +
  scale_fill_viridis_c() +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
 # facet_grid( shrink = FALSE, space = 'free') +
  facet_wrap(vars(Month), ncol = 3)  ## facet grid prints all in 1 column
  #labs(title = 'Month: {closest_state}') +
  #transition_states(Month, transition_length = 3, state_length = 6) +
  #ease_aes('linear')

p

Quiz usage by grade and subject

Class 10 is the only grade with meaningful quiz usage. Assuming again pushed by YouTube. Need to evaluate the utility of quiz feature in its current form more deeply

Replace with % completed

df_quizzes %>%
  mutate(Month = as.yearmon(`Time Finished`, "%m/%Y")) %>%
  group_by(Grade, Subject, Month) %>%
  summarise(num_quizzes =  n()) %>%
  ggplot(mapping = aes(x = Grade, y = Subject))+
  geom_tile(aes(fill = num_quizzes)) +
  #scale_fill_viridis_c() +
  theme_bw() +
  scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
                         midpoint = 500, space = "Lab", na.value = "grey50", 
                         guide = "colourbar", aesthetics = "fill")

 ## theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
## panel.background = element_blank(), axis.line = element_line(colour = "black")) 
  ## labs(title = 'Month: {closest_state}') +
  ## transition_states(Month, transition_length = 3, state_length = 6) +
  ## ease_aes('linear')

Quiz usage by chapters

df_quizzes %>%
  #filter(Grade == 11) %>%
  #filter(as.Date(`Time Finished`) > "2018-09-01") %>%
  #filter(as.Date(`Time Finished`) < "2018-12-01") %>%
  #mutate(Month = as.yearmon(`Time Finished`, "%m/%Y")) %>%
  #group_by(Subject, ChNo, Month) %>%
  group_by(Grade, Subject, ChNo) %>%
  summarise(num_quizzes =  n()) %>%
  ggplot(mapping = aes(x = Subject, y = ChNo))+
  geom_tile(aes(fill = num_quizzes)) +
 # scale_fill_viridis_c() +
  scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
                         midpoint = 2000, space = "Lab", na.value = "grey50", 
                         guide = "colourbar", aesthetics = "fill") +
 
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  facet_wrap(vars(Grade), ncol = 4, nrow = 3)  ## facet grid prints all in 1 column

  #labs(title = 'Month: {closest_state}') +
  #transition_states(Month, transition_length = 3,state_length = 6) +
  #ease_aes('linear')

Quiz usage is pretty high for 1st few chapters of 10th and 11th, but pretty low otherwise

Seasonality of usage - By Month, Day of Week, Time of Day, etc

MoM usage - Videos & Quizzes

mom_videos <- df_videos %>%
  mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
         DoW = wday(`Time Finished`)
         ) %>%
  group_by(Month) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x= reorder(as.character(Month), Month), y=Count)) +
  geom_bar(stat = "identity", fill = "steelblue", color = "black") +
  geom_text(aes(label = Count), hjust = 1, size = 4, color = "white") +
  theme_minimal()+
  labs(title = "# Videos viewed", x= "Month", y= "Count") +
  coord_flip()

### MoM usage - Quizzes
mom_quizzes <- df_quizzes %>%
  mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
         DoW = wday(`Time Finished`)
         ) %>%
  group_by(Month) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x= reorder(as.character(Month), Month), y=Count)) +
  geom_bar(stat = "identity", fill = "orange", color = "black") +
  geom_text(aes(label = Count), hjust = 1, size = 4, color = "black") +
  theme_minimal()+
  labs(title = "# Quizzes completed", x= "Month", y= "Count") +
  coord_flip()

grid.arrange(mom_quizzes, mom_videos, ncol = 2)

DoW - Video & Quiz Usage

dow_videos <- df_videos %>%
  mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
         DoW = wday(`Time Finished`, label = TRUE)
         ) %>%
  group_by(DoW) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x= DoW, y=Count)) +
  geom_bar(stat = "identity", fill = "steelblue", color = "black") +
  geom_text(aes(label = Count), hjust = 1, size = 4, color = "white") +
  theme_minimal()+
  labs(title = "# Videos viewed", x= "Day Of Week", y= "Count") +
  coord_flip()

dow_quizzes <- df_quizzes %>%
  mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
         DoW = wday(`Time Finished`, label = TRUE)
         ) %>%
  group_by(DoW) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x= DoW, y=Count)) +
  geom_bar(stat = "identity", fill = "orange", color = "black") +
  geom_text(aes(label = Count), hjust = 1, size = 4, color = "black") +
  theme_minimal()+
  labs(title = "# Quizzes completed", x= "Day Of Week", y= "Count") +
  coord_flip()

grid.arrange(dow_quizzes, dow_videos, ncol = 2)

MoM distinct users - Videos & Quizzes

mom_distinct_videos <- df_videos %>%
  mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
         DoW = wday(`Time Finished`)
         ) %>%
  group_by(Month) %>%
  summarise(DistinctUsers = n_distinct(DistinctID)) %>%
  ggplot(aes(x= reorder(as.character(Month), Month), y=DistinctUsers)) +
  geom_bar(stat = "identity", fill = "steelblue", color = "black") +
  geom_text(aes(label = DistinctUsers), hjust = 1, size = 4, color = "white") +
  theme_minimal()+
  labs(title = "# Distinct Users watching videos", x= "Month", y= "Count") +
  coord_flip()

mom_distinct_quizzes <- df_quizzes %>%
  mutate(Month = as.yearmon(`Time Finished`, "%m/%Y"),
         DoW = wday(`Time Finished`)
         ) %>%
  group_by(Month) %>%
  summarise(DistinctUsers = n_distinct(DistinctID)) %>%
  ggplot(aes(x= reorder(as.character(Month), Month), y=DistinctUsers)) +
  geom_bar(stat = "identity", fill = "orange", color = "black") +
  geom_text(aes(label = DistinctUsers), hjust = 1, size = 4, color = "black") +
  theme_minimal()+
  labs(title = "# Distinct Users finishing Quizzes", x= "Month", y= "Count") +
  coord_flip()

grid.arrange(mom_distinct_quizzes, mom_distinct_videos, ncol = 2)

Hour Of Day usage - Videos

hour_distinct_videos <- df_videos %>%
  mutate(Hour = hour(`Time Finished`)) %>%
  group_by(Hour) %>%
  summarise(DistinctUsers = n_distinct(DistinctID)) %>%
  ggplot(aes(x= Hour, y=DistinctUsers)) +
  geom_bar(stat = "identity", fill = "steelblue", color = "black") +
  geom_text(aes(label = DistinctUsers), hjust = 1, size = 4, color = "white") +
  theme_minimal() +
  labs(title = "# Distinct Users watching videos", x= "Hour of the day", y= "DistinctUsers") +
  coord_flip()

### Hour Of Day usage - Quizzes
hour_distinct_quizzes <- df_quizzes %>%
  mutate(Hour = hour(`Time Finished`)) %>%
  group_by(Hour) %>%
  summarise(DistinctUsers = n_distinct(DistinctID)) %>%
  ggplot(aes(x= Hour, y=DistinctUsers)) +
  geom_bar(stat = "identity", fill = "orange", color = "black") +
  geom_text(aes(label = DistinctUsers), hjust = 1, size = 4, color = "black") +
  theme_minimal() +
  labs(title = "# Distinct Users giving quizzes", x= "Hour of the day", y= "DistinctUsers") +
  coord_flip()

### Hour of the day usage - Quizzes vs Videos
grid.arrange(hour_distinct_quizzes, hour_distinct_videos, ncol = 2)

Lifetime activity decay

Let’s find activity start and activity end date per user

video_life <- df_videos %>%
  group_by(DistinctID) %>%
  summarise(First_Video = min(`Time Finished`),
            Last_Video = max(`Time Finished`))


quiz_life <- df_quizzes %>%
  group_by(DistinctID) %>%
  summarise(First_Quiz = min(`Time Finished`),
            Last_Quiz = max(`Time Finished`))

### Join video and quiz lifetimes
activity_life <- full_join(video_life, quiz_life, by = "DistinctID")
### Common start and end dates for video / quizzes
activity_life %<>%
  mutate(First_Video_Quiz = if_else(is.na(First_Video), First_Quiz, 
                                    if_else(is.na(First_Quiz), First_Video, 
                                            if_else(First_Video < First_Quiz, First_Video, First_Quiz)
                                            )
                                    ),
         Last_Video_Quiz = if_else(is.na(Last_Video), Last_Quiz, 
                                    if_else(is.na(Last_Quiz), Last_Video, 
                                            if_else(Last_Video > Last_Quiz, Last_Video, Last_Quiz)
                                            )
                                    )
  )

activity_life %<>%
  mutate(Age = as.integer(difftime(Last_Video_Quiz, First_Video_Quiz, units = "days")))

#sample_n(activity_life, 10)

User Journey - Video + Quiz

At what stage / week do they watch video or quiz, etc

all_events <- df_quizzes %>%
  select(DistinctID, `Chapter Name`, `Time Finished`) %>%
  mutate(Event = 'Quiz')
  
tmp <- df_videos %>%
  select(DistinctID, `Chapter Name`, `Time Finished`) %>%
  mutate(Event = 'Video')

all_events %<>%
  bind_rows(tmp)

## Calculate the event age based on first activity of the user
all_events %<>%
  left_join(select(activity_life, DistinctID, First_Video_Quiz, Last_Video_Quiz, Age),
            by = "DistinctID") %>%
  mutate(Day_No = as.integer(difftime(`Time Finished`, First_Video_Quiz, units = "days"))) %>%
  mutate(Days_to_Die = as.integer(difftime(Last_Video_Quiz, `Time Finished`, units = "days")))

## Add the activity count serially per user
all_events %<>% 
  group_by(DistinctID) %>% 
  arrange(`Time Finished`, .by_group=TRUE) %>%  
  mutate(Event_No = row_number())

Histogram of user ages

activity_life %>%
  filter(Age > 0) %>%
  ggplot(aes(x = Age)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(-10,100)) +
  geom_vline(aes( xintercept = median(Age, na.rm = TRUE)))

Vertical line indicates median age, ~15 days or 2 weeks.

Distribution of user ages my monthly cohorts

activity_life %>%
  mutate(Start_Month = as.yearmon(First_Video_Quiz, "%m/%Y")) %>%
  filter(Age > 0) %>%
  ggplot(aes(x = Age, y = reorder(as.character(Start_Month), Start_Month))) +
  geom_density_ridges() +
  coord_cartesian(xlim = c(0, 30)) +
  labs(title = "Age distribution of app users by monthly cohorts", y = "Month", x = "Age (# days from first video / quiz to last video / quiz)")

Distribution of quiz user ages my monthly cohorts

quiz_ridge <- quiz_life %>%
  mutate(Start_Month = as.yearmon(First_Quiz, "%m/%Y")) %>%
  mutate(Age = as.integer(difftime(Last_Quiz, First_Quiz, units = "days"))) %>%
  filter(Age > 0) %>%
  ggplot(aes(x = Age, y = reorder(as.character(Start_Month), Start_Month))) +
  geom_density_ridges() +
  coord_cartesian(xlim = c(0, 365)) +
  labs(title = "Age distribution of quiz users", y = "Month", x = "Age (# days from first quiz to last quiz)")

quiz_ridge

Distribution of video user ages my monthly cohorts

video_ridge <- video_life %>%
  mutate(Start_Month = as.yearmon(First_Video, "%m/%Y")) %>%
  filter(as.Date(First_Video) > as.Date("2018-04-30")) %>%
  mutate(Age = as.integer(difftime(Last_Video, First_Video, units = "days"))) %>%
  filter(Age > 0) %>%
  ggplot(aes(x = Age, y = reorder(as.character(Start_Month), Start_Month))) +
  geom_density_ridges() +
  coord_cartesian(xlim = c(0, 365)) +
  labs(title = "Age distribution of video users", y = "Month", x = "Age (# days from first video to last video)")

video_ridge

Dead users

Let’s check students who have inactive in the last 3 months, and their credits balance

dead_users <- activity_life %>%
  filter(Last_Video_Quiz < as.Date("2018-10-31"))

dead_user_events <- all_events %>%
  filter(Last_Video_Quiz < as.Date("2018-10-31"))

Credits balance for the last video of dead users

dead_users %<>%
  left_join(df_videos, by = c("DistinctID"="DistinctID", "Last_Video"="Time Finished")) %>%
  select(-`Video Duration`, -`Topic Code`, -`Video Name`, -`Watch Duration`, -Subject, 
         -Grade, -ChNo, -TopicNo, -Curriculum, -`Chapter Name`)

### Let's check the credit balance of dead users who used the app for atleast 2 days
dead_users %>%
  filter(Age > 1) %>%
 # ggplot(aes(x=log10(1+`Credits Balance`)))+
  ggplot(aes(x=`Credits Balance`))+
  geom_histogram(binwidth = 10) +
  coord_cartesian(xlim = c(-20,400)) +
  geom_vline(aes( xintercept = median(`Credits Balance`, na.rm = TRUE)))

Let’s see what the dead users did in their last 7 days We should check if the video watch rates are low OR the quiz scores are low

#all_events %>%
dead_user_events %>%
  filter(Age > 60) %>%
  filter(Days_to_Die < 8) %>%
  group_by(Days_to_Die, Event) %>%
  summarise(count = n()) %>%
  #spread(key = Event, value = count) %>%
  ggplot() +
  geom_line(aes(x = Days_to_Die, y = count, colour = Event))

More than usual quizzes and videos were watched on the last day !! This points to intermittent usage by the dead users. Hence, we do not see a lot of daily activity for them. Let’s plot this weekly

#all_events %>%
dead_user_events %>%
  filter(Age > 200) %>%
  #filter(Days_to_Die < 8) %>%
  mutate(Weeks_to_Die = as.integer(Days_to_Die / 7)) %>%
  group_by(Weeks_to_Die, Event) %>%
  summarise(count = n()) %>%
  #spread(key = Event, value = count) %>%
  ggplot() +
  geom_line(aes(x = Weeks_to_Die, y = count, colour = Event))

This makes more sense ! There is a drop in user activity before they use the app for the last time

Let’s compare with the first 7 days

all_events %>%
  filter(Age > 100) %>%
  filter(Day_No < 100) %>%
  group_by(Day_No, Event) %>%
  summarise(count = n()) %>%
  #spread(key = Event, value = count) %>%
  ggplot() +
  geom_line(aes(x = Day_No, y = count, colour = Event))

Vertical line indicates the median no of credits for dead users, which is ~ 20

Across curriculum - which videos are being watched > or < 50%

df_videos %>%
  mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
  ggplot(aes(x = perc_watched)) +
  geom_histogram(binwidth = 5) +
  geom_vline(aes( xintercept = mean(perc_watched))) +
 # geom_text(label = mean(perc_watched)) +
  coord_cartesian(xlim = c(-5, 150))

At event level, mean watch rate is 28% (At video level,we have some videos with > 100% median watch rate. Mean is ~10% ) Compare with YouTube watch rates

Watch Duration

Avg watch duration for our videos, in secs, is ~13 secs

df_videos %>%
  ggplot(aes(as.integer(`Watch Duration`)))+
  geom_histogram(binwidth = 50) +
  geom_vline(aes( xintercept = median(`Watch Duration`))) +
  # geom_text(aes(label = mean(Median_Watch_Rate)))+
  coord_cartesian(xlim = c(0, 500))

Let’s check which video names of top and bottom performers

df_videos %>%
  mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
  group_by(Subject, Grade, ChNo, `Video Name`) %>%
  summarise(Median_Watch_Rate = round(median(perc_watched, na.rm = TRUE), 0)) %>%
  arrange(desc(Median_Watch_Rate)) 

User Journey - Videos

Are students watching videos sequantially ? or specifically something for revision ? We already saw students usually begin from Chapter 1 always

df_videos %>%
  left_join(video_life, by = "DistinctID") %>%
  mutate(Video_Age = as.integer(difftime(`Time Finished`, First_Video, units = "days"))) %>%
  filter(Grade == 11) %>%
  ggplot(aes(y = Video_Age, x = ChNo))+
  geom_boxplot()

Most students reach the end of the cuuriculum within 2-3 months from their start date

Let’s check if % video watched decays over time

boxplot_watch_rate <- df_videos %>%
  mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
  left_join(video_life, by = "DistinctID") %>%
  mutate(Event_Age = as.integer(difftime(`Time Finished`, First_Video, units = "days"))) %>%
  mutate(Video_Age = as.integer(difftime(Last_Video, First_Video, units = "days"))) %>%
  mutate(Week_No = as.integer(Event_Age / 7)) %>%
  filter(Video_Age > 60) %>%
  ggplot(aes(x = Week_No, y = perc_watched, group = Week_No))+
  geom_boxplot(notch=FALSE, fill="blue", alpha=0.2, outlier.size = 0.2) +
  coord_cartesian(ylim=c(0, 300))  ## Zoom Without removing data

boxplot_watch_rate

There is no decay, in general, of watch %age over time. In fact, the longer the students stay, they tend to watch videos for longer.

Let’s try a lineplot for each distinct user for their video watch rate

lineplot_watch_rate <- df_videos %>%
  mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
  left_join(video_life, by = "DistinctID") %>%
  mutate(Event_Age = as.integer(difftime(`Time Finished`, First_Video, units = "days"))) %>%
  mutate(Video_Age = as.integer(difftime(Last_Video, First_Video, units = "days"))) %>%
  mutate(Week_No = as.integer(Event_Age / 7)) %>%
  filter(Video_Age > 200) %>%
  ggplot(aes(x = Week_No, y = perc_watched, colour = DistinctID)) +
  geom_line() +
  theme(legend.position = "none") +
  coord_cartesian(ylim=c(0, 300))  ## Zoom Without removing data

lineplot_watch_rate

Wild variation between successive videos being watched

Let’s check the boxplot but only for dead users.

df_videos %>%
  semi_join(dead_users, by = "DistinctID") %>%
  mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
  left_join(video_life, by = "DistinctID") %>%
  mutate(Video_Age = as.integer(difftime(`Time Finished`, First_Video, units = "days"))) %>%
  mutate(Week_No = as.integer(Video_Age / 7)) %>%
  ggplot(aes(x = Week_No, y = perc_watched, group = Week_No))+
  geom_boxplot(notch=FALSE, fill="blue", alpha=0.2, outlier.size = 0.2) +
  coord_cartesian(ylim=c(0, 300))  ## Zoom Without removing data

We can see a decay in the outlier watch % over weeks, but the median doesn’t change much. So students dropping off after getting bored of videos doesn’t seem likely.

User Journey - Quizzes

Are students attempting quizzes sequentially ?

df_quizzes %>%
  left_join(quiz_life, by = "DistinctID") %>%
  mutate(Quiz_Age = as.integer(difftime(`Time Finished`, First_Quiz, units = "days"))) %>%
  filter(Grade == 11) %>%
  ggplot(aes(y = Quiz_Age, x = ChNo))+
  geom_boxplot()

Most students do attempt quizzes sequentially only.

Scatterplot of events, using Event age as timeframe

all_events %>%
  # filer(Age > 60) %>%
  group_by(DistinctID, Day_No) %>%
  summarise(Total_Videos = sum(Event == 'Video'),
            Total_Quizzes = sum(Event == 'Quiz')) %>%
  ggplot(aes(x = Total_Videos, y = Total_Quizzes)) +
  geom_point(aes(alpha = 0.001)) +
  theme_bw() +
  coord_cartesian(xlim = c(-1,100), ylim = c(-1,100)) 

 ## labs(title = '# Days since first activity : {closest_state}') +
 ## transition_states(Day_No, transition_length = 3, state_length = 6) +
 ## ease_aes('linear') 

Let’s break this down into weeks

all_events %>%
  mutate(Week_No = as.integer(Day_No / 7)) %>%
  filter(Week_No < 16) %>%
  group_by(DistinctID, Week_No) %>%
  summarise(Total_Videos = sum(Event == 'Video'),
            Total_Quizzes = sum(Event == 'Quiz')) %>%
  ggplot(aes(x = Total_Videos, y = Total_Quizzes)) +
  geom_point(aes(alpha = 0.001)) +
  theme_bw() +
  coord_cartesian(xlim = c(-1,100), ylim = c(-1,100)) +
  facet_wrap(vars(Week_No), nrow = 4, ncol = 4)

Not much difference from Weeks 3 to 7 !

Let’s check mean no of quizzes and videos per week

all_events %>%
  ## filter(Age > 200) %>%
  mutate(Week_No = as.integer(Day_No / 7)) %>%
  #filter(Week_No < 10) %>%
  group_by(Week_No, DistinctID) %>%
  summarise(Num_Videos = sum(Event == 'Video'),
            Num_Quizzes = sum(Event == 'Quiz')) %>%
  group_by(Week_No) %>%
  summarise(Mean_Videos = round(mean(Num_Videos, na.rm = TRUE),0),
            Mean_Quizzes = round(mean(Num_Quizzes, na.rm = TRUE),0))

Let’s plot this using boxplot to see the variation in no of events per week

all_events %>%
  ## filter(Age > 200) %>%
  mutate(Week_No = as.integer(Day_No / 7)) %>%
  #filter(Week_No < 10) %>%
  group_by(Week_No, DistinctID) %>%
  summarise(Num_Videos = sum(Event == 'Video'),
            Num_Quizzes = sum(Event == 'Quiz')) %>%
  ungroup() %>%
  #filter(Week_No < 30) %>%
  group_by(Week_No) %>%
  ggplot(aes(x = Week_No, y = Num_Videos, group = Week_No))+
  geom_boxplot(notch=FALSE, fill="red", alpha=0.2, outlier.size = 0.2) +
  coord_cartesian(ylim=c(0, 25))  ## Without removing data, just zoom

We see that the median no of videos are only 2-3 per week, but the outliers are pulling the mean upto 5. After Week 30, the sample size is pretty small so the outliers are pulling the median up.

Let’s check box plot for quizzes

all_events %>%
  #filter(Age > 200) %>%
  mutate(Week_No = as.integer(Day_No / 7)) %>%
  #filter(Week_No < 10) %>%
  group_by(Week_No, DistinctID) %>%
  summarise(Num_Videos = sum(Event == 'Video'),
            Num_Quizzes = sum(Event == 'Quiz')) %>%
  ungroup() %>%
  #filter(Week_No < 30) %>%
  group_by(Week_No) %>%
  ggplot(aes(x = Week_No, y = Num_Quizzes, group = Week_No))+
  geom_boxplot(notch=FALSE, fill="blue", alpha=0.2, outlier.size = 0.2) +
  coord_cartesian(ylim=c(0, 5))  ## Without removing data, just zoom

Most students attempt 0-1 (median 0, men 1) quizzes per week, and there seems to be no variation by week, except the decay of outliers. Makes sense though, as each topic has roughly 4-5 videos and only 1 quiz.

Let’s group these activities by user and check how their activity states vary per week

weekly_user_activity <- all_events %>%
  filter(Age > 300) %>%
  mutate(Week_No = as.integer(Day_No / 7)) %>%
  #filter(Week_No < 10) %>%
  group_by(Week_No, DistinctID) %>%
  summarise(Activity_Level = as.integer(n() / 10),
            IsActive = if_else(n() > 10, 1, 0),
            Total_Activities_Till_Date = max(Event_No)) %>%
  ungroup() 

weekly_user_activity %>%
  #mutate(Cumulative_Activities = gr)
  ggplot(aes(x = Week_No, y = IsActive, colour = DistinctID, alpha = 0.1)) +
  geom_line() +
  theme(legend.position = "none") 

Activity states change very rapidly week on week. Let’s check their cumulative activity sum by week

all_events %>%
  filter(Age > 300) %>%
  filter(Day_No < 30) %>%
  mutate(Week_No = as.integer(Day_No / 7)) %>%
  #filter(Week_No < 10) %>%
  group_by(Week_No, DistinctID) %>%
  summarise(Activity_Level = as.integer(n() / 10),
            IsActive = if_else(n() > 10, 1, 0),
            Total_Activities_Till_Date = max(Event_No)) %>%

  ggplot(aes(x = Week_No, y = Total_Activities_Till_Date, colour = DistinctID, alpha = 0.1)) +
  geom_line() +
  theme(legend.position = "none")

Looking at cumulative activity plots for just the videos with a cutoff for qualifying as an event

df_videos %>%
  mutate(perc_watched = 100 * `Watch Duration` / `Video Duration`) %>%
  filter(perc_watched > 90) %>%
  filter(`Watch Duration` > 20) %>%
  left_join(all_events, by = c("DistinctID", "Time Finished")) %>%
 # mutate(Week_No = as.integer(Day_No / 7)) %>%
  filter(Day_No < 30) %>%
  group_by(Day_No, DistinctID) %>%
  summarise(Activity_Level = as.integer(n() / 10),
            IsActive = if_else(n() > 10, 1, 0),
            Total_Activities_Till_Date = max(Event_No)) %>%
  ggplot(aes(x = Day_No, y = Total_Activities_Till_Date, colour = DistinctID, alpha = 0.1)) +
  geom_line() +
  theme(legend.position = "none")

Let’s check this graph by day

all_events %>%
  filter(Age > 300) %>%
 # mutate(Week_No = as.integer(Day_No / 7)) %>%
  group_by(Day_No, DistinctID) %>%
  summarise(Total_Activities_Till_Date = max(Event_No)) %>%
  ggplot(aes(x = Day_No, y = Total_Activities_Till_Date, colour = DistinctID, alpha = 0.1)) +
  geom_line() +
  theme(legend.position = "none")

Some users stay dormant for a long time and come back after 3-6 months.

Let’s check this separately for videos and quizzes

cumulative_video_plot <- all_events %>%
  filter(Age > 250) %>%
  filter(Event == "Video") %>%
 # mutate(Week_No = as.integer(Day_No / 7)) %>%
  group_by(Day_No, DistinctID) %>%
  summarise(Total_Activities_Till_Date = max(Event_No)) %>%
  ggplot(aes(x = Day_No, y = Total_Activities_Till_Date, colour = DistinctID, alpha = 0.1)) +
  geom_line() +
  ggtitle("Only Videos") +
  theme(legend.position = "none")

cumulative_quiz_plot <- all_events %>%
  filter(Age > 250) %>%
  filter(Event == "Quiz") %>%
 # mutate(Week_No = as.integer(Day_No / 7)) %>%
  group_by(Day_No, DistinctID) %>%
  summarise(Total_Activities_Till_Date = max(Event_No)) %>%
  ggplot(aes(x = Day_No, y = Total_Activities_Till_Date, colour = DistinctID, alpha = 0.1)) +
  geom_line() +
  ggtitle("Only Quizzes") +
  theme(legend.position = "none")

grid.arrange(cumulative_video_plot, cumulative_quiz_plot)

Let’s see per person, how many quizzes have been attempted per chapter

quiz_usage <- df_quizzes %>%
  mutate(perc_score = round((100* `Correct Questions` / `Total Questions`),0)) %>%
  group_by(DistinctID, `Chapter Code`) %>%
  summarise(finished = n(),
            avg_score = mean(perc_score, na.rm = TRUE)) %>%
  left_join(tot_quizzes, by = c("Chapter Code" = "Parent Code")) %>%  #Total quizzes available chapter
  mutate(usage = round((100* finished / num_quiz),0))

sample_frac(quiz_usage)

Difficulty of quizzes across chapters

Were the quizzes too easy ? Avg score > 50%

df_quizzes %>%
  mutate(perc_score = round((100* `Correct Questions` / `Total Questions`),0)) %>%
  group_by(Grade, Subject, ChNo) %>%
  summarise(num_users =  n_distinct(DistinctID),
            Avg_Score = mean(perc_score, na.rm = TRUE)
            ) %>%
  ggplot(mapping = aes(x = Subject, y = ChNo))+
  geom_tile(aes(fill = Avg_Score)) +
  facet_grid(cols = vars(Grade)) +
  theme_bw() +
  scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
                         midpoint = 50, space = "Lab", na.value = "grey50", 
                         guide = "colourbar", aesthetics = "fill")

Quiz completed per chapters

Currently not accurate as multiple events can be logged for the same quiz and same user

quiz_usage %>%
  group_by(Grade, Subject, ChNo) %>%
  summarise(Median_perc_quizzes_completed = median(usage, na.rm = TRUE)
            ) %>%
  ggplot(mapping = aes(x = Subject, y = ChNo))+
  geom_tile(aes(fill = Median_perc_quizzes_completed)) +
  facet_grid(cols = vars(Grade)) +
  theme_bw() +
  scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
                         midpoint = 50, space = "Lab", na.value = "grey50", 
                         guide = "colourbar", aesthetics = "fill")

Let’s check distribution of users by their % quiz completed per chapter

quiz_usage %>%
  group_by(DistinctID) %>%
  summarise(Mean_perc_quizzes_completed_per_chapter = mean(usage, na.rm = TRUE),
            num_chapters_attempted = n()) %>%
  filter(num_chapters_attempted > 10) %>%
  ggplot() +
  geom_histogram(aes(x=Mean_perc_quizzes_completed_per_chapter), binwidth = 5) +
  geom_vline(aes( xintercept = mean(Mean_perc_quizzes_completed_per_chapter))) +
  # geom_text
  coord_cartesian(xlim=c(-5,120))

Cohorts of Users by campaign sources

au_signups <- left_join(successful_signups, au, by = "DistinctID")
sample_n(au_signups, 20)

Summary of app activity by campaign sources

au_signups %>%
  group_by(UTM.Source) %>%
  summarise(Count = n(),
            Avg_Videos_Watched = mean(NumVideos, na.rm = TRUE),
            Avg_Num_Days_Active = mean(NumDaysActive, na.rm = TRUE),
            Avg_Quizzes_Completed = mean(NumQuizzes, na.rm = TRUE)) %>%
  arrange(desc(Count))
  #filter(Count > 200) %>%
  #filter(! UTM.Source %in% c("NA", "(not set)")) %>%
  #ggplot() +
  #geom_barplot(aes(x = UTM.Source))

Let’s check a boxplot for the most common sources

p1 <- au_signups %>%
  filter( UTM.Source %in% c("google-play", "YouTube", "avanti.in", "Google Form", "Print Media")) %>%
  ggplot() +
  geom_boxplot(aes(x = UTM.Source, y = NumQuizzes)) ## change y to NumVideos or NumDaysActive

p2 <- au_signups %>%
  filter( UTM.Source %in% c("google-play", "YouTube", "avanti.in", "Google Form", "Print Media")) %>%
  ggplot() +
  geom_boxplot(aes(x = UTM.Source, y = NumVideos)) ## change y to NumVideos or NumDaysActive

p3 <- au_signups %>%
  filter( UTM.Source %in% c("google-play", "YouTube", "avanti.in", "Google Form", "Print Media")) %>%
  ggplot() +
  geom_boxplot(aes(x = UTM.Source, y = NumDaysActive)) ## change y to NumVideos or NumDaysActive

grid.arrange(p1, p2, p3, nrow = 3)

There is no major difference between different students from different campaigns !

Why users dropped off ?

## Time to run out of credits - Credit Balance decay
## Left NOT due to credit balance
## Not enough quiz questions ? - # of chapters with > 50% quizzes attempted
## Who leaves in Wk1 ?
## Who leaves in Wk2 ?
## Who leaves in Wk3 ?
## Who leaves in Wk4 ?

How are they starting to watch - Rec video / Rec quiz or Learn page ?

Are quizzes engaging ?

Qs attempted / Total Quiz length for quizzes across chapters

incomplete_quizzes <- read_csv("incomplete_quizzes.csv")
glimpse(incomplete_quizzes)
## Observations: 9,814
## Variables: 12
## $ `Chapter Code`      <chr> "CHM-11-JEE-02", "CHM-11-JEE-02", "CHM-11-JE…
## $ `Chapter Name`      <chr> "Atomic Structure", "Atomic Structure", "Per…
## $ `Correct Questions` <int> 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 0, 0, 1,…
## $ `Credits Balance`   <int> 20, 20, 40, 40, 40, 20, 40, 10, 12, 20, 40, …
## $ DistinctID          <chr> "bddbbc80-d105-42a5-a15d-e64db5133ad1", "bdd…
## $ Name                <chr> "Prachi", "Prachi", "Prachi", "Prachi", "Ram…
## $ `Quiz Name`         <chr> "Practice Quiz 1", "Practice Quiz 2", "Early…
## $ Subject             <chr> "Chemistry", "Chemistry", "Chemistry", "Chem…
## $ `Time Finished`     <dttm> 2018-07-28 14:27:21, 2018-07-28 14:29:19, 2…
## $ `Time Spent(s)`     <int> 115, 200, 271, 252, 126, 52, 58, 31, 168, 98…
## $ `Total Questions`   <int> 5, 5, 7, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 4,…
## $ UserID              <chr> "2013Prachipant@gmail.com", "2013Prachipant@…

Remove rows which got completed later

Anti join with the quiz_scores by DistinctID, Chapter Code & Quiz Name

incomplete_quiz_only <- anti_join(incomplete_quizzes, df_quizzes, 
                                  by = c("DistinctID", "Chapter Code", "Quiz Name"))

#sample_n(incomplete_quiz_only, 20)

Let’s see if there is a difference between

chr_mat4 = str_split(incomplete_quiz_only$`Chapter Code`, "-", n = 4, simplify = TRUE)
colnames(chr_mat4) <- c("Subject", "Grade", "Curriculum", "ChNo")
chapter_code_info <- as_tibble(chr_mat4)

incomplete_quiz_only %<>%
  bind_cols(chapter_code_info) %>%
  mutate(Grade = as.integer(Grade))

incomplete_quiz_only %>%
  mutate(perc_score = round((100* `Correct Questions` / `Total Questions`),0)) %>%
  group_by(Grade, Subject, ChNo) %>%
  summarise(num_users =  n_distinct(DistinctID),
            Avg_Score = mean(perc_score, na.rm = TRUE)
            ) %>%
  ggplot(mapping = aes(x = Subject, y = ChNo))+
  geom_tile(aes(fill = Avg_Score)) +
  facet_grid(cols = vars(Grade)) +
  theme_bw() +
  scale_fill_gradient2(low = muted("red"), mid = "yellow", high = muted("green"),
                         midpoint = 50, space = "Lab", na.value = "grey50", 
                         guide = "colourbar", aesthetics = "fill")

Clearly, students who do not complete quizzes do so due to poor scores on Questions attempted thus far. But, only 6413 out of 52456 remain unfinished in the last 1 year.

Let’s check quiz completion by the difficulty levels of the quiz

quizzes <- read.delim(textConnection(getURL(
  "https://avanti-dump.s3-ap-southeast-1.amazonaws.com/quiz-resp-dumps/quizzes.tsv")),
  header = FALSE, stringsAsFactors = FALSE)

colnames(quizzes) <- c("LMSTestID", "CMSQuizID", "ParentType", "ParentID", "QuizName", "Tags")

sample_n(quizzes, 10)

We have 4 kinds of chapter quizzes and then some topic quizzes. Let’s add quiz type

quizzes %<>% mutate(Quiz_Type = if_else(ParentType == "Topic", "TopicQuiz", Tags))

sample_n(quizzes, 10)

Now let’s check responses

quiz_responses <- read.delim(textConnection(
  getURL("https://avanti-dump.s3-ap-southeast-1.amazonaws.com/quiz-resp-dumps/responses.tsv")),
  header = FALSE, stringsAsFactors = FALSE)

colnames(quiz_responses) <- c("Timestamp","LMSTestID","StudentID", "ProblemID", "Response")

sample_n(quiz_responses, 10)

Let’s check count of responses per question on Quizzes on each type

response_count <- quiz_responses %>% group_by(LMSTestID, Response) %>% summarise(Count = n()) %>% spread(Response, Count)

quizzes %<>% left_join(response_count, by = "LMSTestID")

sample_n(quizzes, 10)

Let’s plot to see if there’s a difference

stack_bar_plot <- quizzes %>%
  filter(! Quiz_Type == "TopicQuiz") %>%
  gather(Correct, Incorrect, `Not Visited`, Skipped, Attempted, key = "Response", value = "ProblemCounts") %>%
  group_by(Quiz_Type, Response) %>%
  summarise(student_problem_counts = sum(ProblemCounts, na.rm = TRUE)) %>%
  ggplot(aes(x = Quiz_Type, y = student_problem_counts, fill = Response)) +
  geom_bar(stat = "identity")

stack_bar_plot

Let’s see a 100% stacked bar chart

perc_stacked_bar_plt <- quizzes %>%
  filter(! Quiz_Type == "TopicQuiz") %>%
  gather(Correct, Incorrect, `Not Visited`, Skipped, Attempted, key = "Response", value = "ProblemCounts") %>%
  group_by(Quiz_Type, Response) %>%
  summarise(student_problem_counts = sum(ProblemCounts, na.rm = TRUE)) %>%
  ggplot(aes(x = Quiz_Type, y = student_problem_counts, fill = Response)) +
  geom_bar(position = "fill", stat = "identity") +
  scale_y_continuous(labels = scales::percent_format())

perc_stacked_bar_plt

App adoption by ALC centre

alc <- read_csv("active_users_since.csv")   ## Active users since April-1-2019
head(alc)
dup <- alc %>%
     filter(! is.na(`Centre Name`)) %>%
     filter( `Centre Name` %in% c("Avanti Gurukul Mobile App", "Online Students - Pilot", 
                                   "Online Students - Paid", "Online JEE Test Series", "Demo Centre 2017")) 
alc %<>%
  filter(! is.na(`Centre Name`)) %>%
  filter(! `Centre Name` %in% c("Avanti Gurukul Mobile App", "Online Students - Pilot", 
                         "Online Students - Paid", "Online JEE Test Series", "Demo Centre 2017")) 

alc %>%
  group_by(`Centre Name`) %>%
  summarise(n = n()) %>%
  filter(n > 4) %>%
  ggplot(aes(x=reorder(`Centre Name`, n), y=n)) +
  geom_bar(stat = "identity", fill = "orange", color = "black") +
  geom_text(aes(label = n), hjust = 1, size = 4, color = "black") +
  theme_minimal()+
  labs(title = "Centre-wise distribution (10+ students)", x= "Centre", y= "Count") +
  coord_flip()

Other than these, there can be kids who are incorrectly tagged to AGM but can be matched by Phone Nos

Match dup with studlist

studlist <- read.delim(
  textConnection(
  getURL("https://s3-ap-southeast-1.amazonaws.com/avanti-dump/dashboard-dumps/studentlist/student_list.tsv")),
  header = TRUE, stringsAsFactors = FALSE)

dup %<>%
  mutate(Phone = as.character(Phone))

ph_no_match <- inner_join(dup, studlist, by = c("Phone" = "Phone"))
parent_ph_no_match <- inner_join(dup, studlist, by = c("Phone" = "Parent.s.Phone"))