Interview Questions in R

LeetCode is a great source of sample problems for Facebook Data Scientist interview prep, with one major caveat. If you select R for your interview, there’s no way to validate your code on the LeetCode platform. Each question does however provide the SQL Schema, so one relatively quick/easy way to practice is to run that schema on your own DB instance (say using Amazon RDS), and connect to it with R (and the RMySQL package). After working through a few problems, I thought I’d post them here in case they can help someone. I’m not saying these are the best/prettiest solutions, but this is what I came up with.

Question: Calculate the overall acceptance rate of friend requests, also take heed of duplicate requests (someone requesting multiple times)

df1 <- dbReadTable(con,"friend_request")
df2 <- dbReadTable(con,"request_accepted")

rate <- nrow(df2 %>% distinct(requester_id, accepter_id)) / nrow(df1 %>% distinct(sender_id, send_to_id))
rate
## [1] 0.8

Can you summarize the acceptance rate by month?

df1$month <- paste0(year(df1$request_date), "-", month(df1$request_date))
df2$month <- paste0(year(df2$accept_date), "-", month(df2$accept_date))

df1_months <- df1 %>% distinct(sender_id, send_to_id, .keep_all = TRUE) %>% group_by(month) %>% summarise(n=n())
df2_months <- df2 %>% distinct(requester_id, accepter_id, .keep_all = TRUE) %>% group_by(month) %>% summarise(n=n())

month_summary <- df1_months %>% left_join(df2_months, by="month") %>% mutate(n.y/n.x)
month_summary
## # A tibble: 1 x 4
##   month    n.x   n.y `n.y/n.x`
##   <chr>  <int> <int>     <dbl>
## 1 2016-6     5     4       0.8

Question: Based on the request_accepted table above, Who has the most friends?

accepted <- df2 %>% distinct(requester_id, accepter_id, .keep_all = TRUE)

requestor <- accepted %>% group_by(requester_id) %>% summarize(n=n())
acceptor <- accepted %>% group_by(accepter_id) %>% summarize(n=n())

joined <- acceptor %>% full_join(requestor, by=c("accepter_id"="requester_id")) %>% replace(is.na(.), 0) %>% mutate(n.x+n.y) %>% arrange(desc(n.x+n.y))
joined[,1]
## # A tibble: 4 x 1
##   accepter_id
##         <int>
## 1           3
## 2           2
## 3           1
## 4           4

Question: Based on a follow table with two columns: followee, follower, get the amount of each follower’s follower if he/she has one.

follow <- dbReadTable(con, "follow")

followers <- follow %>% distinct(follower)

filtered <- follow %>% filter(followee %in% followers$follower) %>% group_by(followee) %>% summarise(num=n()) %>% arrange(followee)
filtered
## # A tibble: 2 x 2
##   followee   num
##   <chr>    <int>
## 1 B            2
## 2 D            1

Question: Find the average for daily percentage of posts that got removed after being reported as spam, rounded to 2 decimal places.

actions <- dbReadTable(con, "Actions")
removals <- dbReadTable(con, "Removals")
results <- actions %>% left_join(removals, by="post_id") 

results$removed <- ifelse(!(is.na(results$remove_date)),1,0)
results <- results %>% filter(extra=="spam") %>% group_by(action_date) %>% summarise(n=n(), removed=sum(removed))
results$prop <- round(results$removed/results$n,2)

round(mean(results$prop),2)
## [1] 0.75

Get the number of posts reported as spam yesterday

results <- actions %>% filter(action_date=="2019-07-04", action=="report") %>% group_by(extra) %>% summarize(n=n())
results
## # A tibble: 1 x 2
##   extra     n
##   <chr> <int>
## 1 spam      2

Question: Given a table attendance_events and a table all_students, what percent of students attend school on their birthday?

#Note that I build a couple data frames here rather than load the from my RDS instance
student_id <- c(1,1,1,2,2,2,3,3,3)
attendance <- c(1,1,0,1,1,1,0,1,1)
attendance_events <- data.frame(cbind(student_id, attendance))
attendance_events$date <- c(today(),today()-1,today()-2,today(),today()-1,today()-2,today(),today()-1,today()-2)

student_id <- c(1,2,3)
grade_level <- c(6,6,7)
all_students <- data.frame(cbind(student_id,grade_level))
all_students$dob <- c(today()-2, today()-1, today())

#actual solution begins here

attendance_events$m_d <- paste0(month(attendance_events$date), "-", day(attendance_events$date))
all_students$m_d <- paste0(month(all_students$dob), "-", day(all_students$dob))

joined <- all_students %>% left_join(attendance_events, by="student_id")

joined$attend_bday <- ifelse((joined$m_d.y==joined$m_d.x & joined$attendance==1),1,0)

sum(joined$attend_bday)/nrow(all_students)
## [1] 0.3333333

##Question: Given two tables, recommend a page based on pages your friends liked. Don’t reco pages you’ve already liked.

# Again here I build up the data frames first rather than load from the DB instance
##Table of users and their friends
user_id <- c(1, 1, 1, 2, 2, 3, 3, 3, 3)
fr_id <- c(10, 20, 30, 130, 110, 90, 80, 70, 30)
df1 <- data.frame(user_id, fr_id)

##Table of users and pages they've liked
user_id <- c(1, 1, 1, 20, 20, 30, 30, 30, 30, 10, 40, 50, 60)
page_id <- c(100, 200, 300, 1300, 1100, 1300, 800, 700, 300, 1300, 100, 1300, 300)
df2 = data.frame(user_id, page_id)

#actual solution starts here

#filter users tbl on my ID
df1 <- df1 %>% filter(user_id=="1")
#get my likes so I don't reco a page I've already liked
my_likes <- df2 %>% filter(user_id == "1") %>% distinct()
#filter my ID AND pages I've liked OUT of the pages liked tbl
df2 <- df2 %>% filter(user_id!="1", !(page_id %in% my_likes[,2]))

#reco
joined <- df1 %>% left_join(df2, by=c("fr_id"="user_id")) %>% group_by(page_id) %>% summarize(n=n()) %>% arrange(desc(n))
reco_page <- joined[1,1]

Given a table of employees and their managers, how can you query who reports to whom?

# This is a self-join problem

staffs <- dbReadTable(con, "staffs")

joined <- staffs %>% inner_join(.,., by=c("manager_id"="staff_id")) %>% rename(EmployeeName=first_name.x, ManagerName=first_name.y) %>% select(EmployeeName, ManagerName)