LeetCode is a great source of sample problems for Facebook Data Scientist interview prep, with one major caveat. If you select R for your interview, there’s no way to validate your code on the LeetCode platform. Each question does however provide the SQL Schema, so one relatively quick/easy way to practice is to run that schema on your own DB instance (say using Amazon RDS), and connect to it with R (and the RMySQL package). After working through a few problems, I thought I’d post them here in case they can help someone. I’m not saying these are the best/prettiest solutions, but this is what I came up with.
df1 <- dbReadTable(con,"friend_request")
df2 <- dbReadTable(con,"request_accepted")
rate <- nrow(df2 %>% distinct(requester_id, accepter_id)) / nrow(df1 %>% distinct(sender_id, send_to_id))
rate
## [1] 0.8
df1$month <- paste0(year(df1$request_date), "-", month(df1$request_date))
df2$month <- paste0(year(df2$accept_date), "-", month(df2$accept_date))
df1_months <- df1 %>% distinct(sender_id, send_to_id, .keep_all = TRUE) %>% group_by(month) %>% summarise(n=n())
df2_months <- df2 %>% distinct(requester_id, accepter_id, .keep_all = TRUE) %>% group_by(month) %>% summarise(n=n())
month_summary <- df1_months %>% left_join(df2_months, by="month") %>% mutate(n.y/n.x)
month_summary
## # A tibble: 1 x 4
## month n.x n.y `n.y/n.x`
## <chr> <int> <int> <dbl>
## 1 2016-6 5 4 0.8
accepted <- df2 %>% distinct(requester_id, accepter_id, .keep_all = TRUE)
requestor <- accepted %>% group_by(requester_id) %>% summarize(n=n())
acceptor <- accepted %>% group_by(accepter_id) %>% summarize(n=n())
joined <- acceptor %>% full_join(requestor, by=c("accepter_id"="requester_id")) %>% replace(is.na(.), 0) %>% mutate(n.x+n.y) %>% arrange(desc(n.x+n.y))
joined[,1]
## # A tibble: 4 x 1
## accepter_id
## <int>
## 1 3
## 2 2
## 3 1
## 4 4
follow <- dbReadTable(con, "follow")
followers <- follow %>% distinct(follower)
filtered <- follow %>% filter(followee %in% followers$follower) %>% group_by(followee) %>% summarise(num=n()) %>% arrange(followee)
filtered
## # A tibble: 2 x 2
## followee num
## <chr> <int>
## 1 B 2
## 2 D 1
actions <- dbReadTable(con, "Actions")
removals <- dbReadTable(con, "Removals")
results <- actions %>% left_join(removals, by="post_id")
results$removed <- ifelse(!(is.na(results$remove_date)),1,0)
results <- results %>% filter(extra=="spam") %>% group_by(action_date) %>% summarise(n=n(), removed=sum(removed))
results$prop <- round(results$removed/results$n,2)
round(mean(results$prop),2)
## [1] 0.75
results <- actions %>% filter(action_date=="2019-07-04", action=="report") %>% group_by(extra) %>% summarize(n=n())
results
## # A tibble: 1 x 2
## extra n
## <chr> <int>
## 1 spam 2
#Note that I build a couple data frames here rather than load the from my RDS instance
student_id <- c(1,1,1,2,2,2,3,3,3)
attendance <- c(1,1,0,1,1,1,0,1,1)
attendance_events <- data.frame(cbind(student_id, attendance))
attendance_events$date <- c(today(),today()-1,today()-2,today(),today()-1,today()-2,today(),today()-1,today()-2)
student_id <- c(1,2,3)
grade_level <- c(6,6,7)
all_students <- data.frame(cbind(student_id,grade_level))
all_students$dob <- c(today()-2, today()-1, today())
#actual solution begins here
attendance_events$m_d <- paste0(month(attendance_events$date), "-", day(attendance_events$date))
all_students$m_d <- paste0(month(all_students$dob), "-", day(all_students$dob))
joined <- all_students %>% left_join(attendance_events, by="student_id")
joined$attend_bday <- ifelse((joined$m_d.y==joined$m_d.x & joined$attendance==1),1,0)
sum(joined$attend_bday)/nrow(all_students)
## [1] 0.3333333
##Question: Given two tables, recommend a page based on pages your friends liked. Don’t reco pages you’ve already liked.
# Again here I build up the data frames first rather than load from the DB instance
##Table of users and their friends
user_id <- c(1, 1, 1, 2, 2, 3, 3, 3, 3)
fr_id <- c(10, 20, 30, 130, 110, 90, 80, 70, 30)
df1 <- data.frame(user_id, fr_id)
##Table of users and pages they've liked
user_id <- c(1, 1, 1, 20, 20, 30, 30, 30, 30, 10, 40, 50, 60)
page_id <- c(100, 200, 300, 1300, 1100, 1300, 800, 700, 300, 1300, 100, 1300, 300)
df2 = data.frame(user_id, page_id)
#actual solution starts here
#filter users tbl on my ID
df1 <- df1 %>% filter(user_id=="1")
#get my likes so I don't reco a page I've already liked
my_likes <- df2 %>% filter(user_id == "1") %>% distinct()
#filter my ID AND pages I've liked OUT of the pages liked tbl
df2 <- df2 %>% filter(user_id!="1", !(page_id %in% my_likes[,2]))
#reco
joined <- df1 %>% left_join(df2, by=c("fr_id"="user_id")) %>% group_by(page_id) %>% summarize(n=n()) %>% arrange(desc(n))
reco_page <- joined[1,1]
# This is a self-join problem
staffs <- dbReadTable(con, "staffs")
joined <- staffs %>% inner_join(.,., by=c("manager_id"="staff_id")) %>% rename(EmployeeName=first_name.x, ManagerName=first_name.y) %>% select(EmployeeName, ManagerName)