Викачка даних про сторінки і групи у Facebook

# ця бібліотека дозволяє підключатися до Facebook API
# інсталювати краще версію з гітхабу, вона актуальніша
library(devtools)
install_github("pablobarbera/Rfacebook/Rfacebook")
## Skipping install of 'Rfacebook' from a github remote, the SHA1 (c7cd323d) has not changed since last install.
##   Use `force = TRUE` to force installation
library(Rfacebook)
## Loading required package: httr
## Loading required package: rjson
## Loading required package: httpuv
## Warning: package 'httpuv' was built under R version 3.3.2
## 
## Attaching package: 'Rfacebook'
## The following object is masked from 'package:methods':
## 
##     getGroup
# бібліотека для різних маніпуляцій з даними
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.2
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.3.2
## Warning: package 'tibble' was built under R version 3.3.2
## Warning: package 'tidyr' was built under R version 3.3.2
## Warning: package 'readr' was built under R version 3.3.2
## Warning: package 'purrr' was built under R version 3.3.2
## Warning: package 'dplyr' was built under R version 3.3.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
# Our spreadsheet with the data about the groups.
fb <- readxl::read_excel("fb_maidan.xlsx")

# Get a token at https://developers.facebook.com/tools/explorer/ and expose it
# to R through the FB_ACCESS_TOKEN environment variable (e.g. in ~/.Renviron).
# Access tokens are credentials, so they must never be committed with the
# source; any token that was previously pasted here should be considered leaked.
token <- Sys.getenv("FB_ACCESS_TOKEN")
if (!nzchar(token)) {
  stop(
    "Set the FB_ACCESS_TOKEN environment variable to a Facebook Graph API token",
    call. = FALSE
  )
}

Створимо тестові масиви.

# Example group: fetch posts from the group's feed (n = 1000 is the cap on
# how many posts are requested).
example_fb <- getGroup("202155769980250",token,n = 1000)
# Group members.
# Depends on the group's settings: the API may return information about all
# members of the group, or it may not.
members <- getMembers("202155769980250",token,n = 10000000)
# Example public page: n is deliberately huge so that, in effect, every
# available post is requested.
example_fb_page <-getPage("Euromaydan.Odessa",token,n = 10000000000000000)

Подивимось на структуру отриманих масивів

glimpse(example_fb)
## Observations: 1,000
## Variables: 10
## $ from_id        <chr> "135146317033206", "706464436081662", "31182211...
## $ from_name      <chr> "Елизавета Сидорова", "Дмитрий Фортунатов", "Ar...
## $ message        <chr> "http://media-collider.com/tr/2017/07/17/turech...
## $ created_time   <chr> "2017-07-25T14:24:31+0000", "2017-07-25T13:55:0...
## $ type           <chr> "link", "link", "link", "video", "link", "link"...
## $ link           <chr> "http://media-collider.com/tr/2017/07/17/turech...
## $ id             <chr> "202155769980250_695391420656680", "20215576998...
## $ likes_count    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 3, 0, 0, 0, 2,...
## $ comments_count <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shares_count   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,...
glimpse(members)
## Observations: 1
## Variables: 5
## $ name          <chr> "Михаил Михайлов"
## $ first_name    <chr> "Михаил"
## $ last_name     <chr> "Михайлов"
## $ id            <chr> "237075823291013"
## $ administrator <lgl> FALSE
glimpse(example_fb_page)
## Observations: 1,347
## Variables: 10
## $ from_id        <chr> "736807679676151", "736807679676151", "73680767...
## $ from_name      <chr> "Евромайдан Одесса", "Евромайдан Одесса", "Евро...
## $ message        <chr> NA, "http://m.ostro.org/video/527833/", NA, NA,...
## $ created_time   <chr> "2017-07-01T16:08:57+0000", "2017-06-30T09:28:5...
## $ type           <chr> "link", "link", "photo", "link", "link", "link"...
## $ link           <chr> "http://www.megafon.od.ua/2017/07/01/v-odesskoi...
## $ id             <chr> "736807679676151_1696992750324301", "7368076796...
## $ likes_count    <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ comments_count <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shares_count   <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...

Інформацію про лайки, коментарі та поширення потрібно завантажувати окремо

#пишу відразу як прописувати масову викачку
posts <- lapply(example_fb$id,function(x)getPost(x,token,1000000000000000))
likes <- do.call("rbind",lapply(1:length(posts), function(i) posts[[i]]$likes)) 
comments <- likes <- do.call("rbind",lapply(1:length(posts), function(i) posts[[i]]$comments)) 
#шери окремо читаються
shares <- do.call("rbind",lapply(example_fb$id,function(x)getShares(x,token,1000000000000000)))

Поглянемо на структуру цих масивів

glimpse(likes)
## Observations: 136
## Variables: 7
## $ from_id        <chr> "524386777696235", "829292903786912", "73716717...
## $ from_name      <chr> "Simon Ivanov", "Aleksandr Trotsyuk", "Олександ...
## $ message        <chr> "Всегда это \"чуть\"", "http://www.metacafe.com...
## $ created_time   <chr> "2017-07-25T14:10:43+0000", "2017-07-23T23:25:2...
## $ likes_count    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,...
## $ comments_count <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ id             <chr> "695385490657273", "694555190740303", "69315548...
glimpse(comments)
## Observations: 136
## Variables: 7
## $ from_id        <chr> "524386777696235", "829292903786912", "73716717...
## $ from_name      <chr> "Simon Ivanov", "Aleksandr Trotsyuk", "Олександ...
## $ message        <chr> "Всегда это \"чуть\"", "http://www.metacafe.com...
## $ created_time   <chr> "2017-07-25T14:10:43+0000", "2017-07-23T23:25:2...
## $ likes_count    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,...
## $ comments_count <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ id             <chr> "695385490657273", "694555190740303", "69315548...
glimpse(shares)
## Observations: 5
## Variables: 4
## $ from_id     <chr> "664422470424233", "664422470424233", "66442247042...
## $ from_name   <chr> "Сплетни Новогродовка", "Сплетни Новогродовка", "С...
## $ shared_time <lgl> NA, NA, NA, NA, NA
## $ id          <chr> "664422470424233_682928161906997", "66442247042423...

Робота в VK

З VK усе трохи складніше: готової бібліотеки немає, тому покладаємося на власні сили.

Функція для отримання постів зі сторінки

# Download VK wall posts with ids id_min..id_max (inclusive) in batches.
#
# Arguments:
#   id_min, id_max  first and last post id to request.
#   id_step         how many post ids go into one wall.getById request.
#
# Reads these globals: `group_id` (used to build the "-<group>_<post>" ids),
# `access_token` (pasted into the query string as-is, so it presumably has to
# be a complete "access_token=..." fragment - confirm) and `sleep_time`
# (seconds to pause between two requests).
# NOTE(review): `getURL` looks like RCurl::getURL, but no library(RCurl) call is
# visible in this file - confirm it is attached before running.
#
# Returns the batches converted by wall2df() and row-bound into one data frame.
# Progress (the upper id of every batch) is printed with cat().
get_wall_posts <- function(id_min, id_max, id_step = 100) {
  extended <- paste0("extended=", 0)
  copy_depth <- paste0("copy_history_depth=", 1)
  cat(id_min, "-", id_max, ": ")

  # Nothing to request: return an empty table instead of falling through to an
  # unassigned `df` (which would resolve to the stats::df function).
  if (id_min > id_max) {
    return(wall2df(list()))
  }

  # seq() includes a batch that starts exactly on id_max, which the former
  # `while (id_lo < id_max)` loop skipped.
  batch_starts <- seq(id_min, id_max, by = id_step)
  batches <- vector("list", length(batch_starts))

  for (k in seq_along(batch_starts)) {
    id_lo <- batch_starts[k]
    # Clamp the last batch so that ids above id_max are never requested.
    id_hi <- min(id_lo + id_step - 1, id_max)
    cat(id_hi, ". ")

    posts <- paste0(
      "posts=",
      paste0("-", group_id, "_", id_lo:id_hi, collapse = ",")
    )
    request <- paste(
      "https://api.vk.com/method/wall.getById?v=4.9",
      posts, extended, copy_depth, access_token,
      sep = "&"
    )
    posts_list <- fromJSON(getURL(request))
    batches[[k]] <- wall2df(posts_list$response)

    # Pause between requests, but not after the final one.
    if (id_hi < id_max) Sys.sleep(sleep_time)
  }

  # Bind once at the end instead of growing a data frame inside the loop.
  do.call(rbind, batches)
}
# Flatten a list of wall posts into a data frame, one row per post.
#
# Argument:
#   wall  list of posts; each post is a list from which the fields id,
#         from_id, created_by / signer_id, post_type, comments$count,
#         likes$count, reposts$count, date and text are read.
#
# Returns a data frame with the columns uid, author, whodidthis, type,
# comments, likes, reposts, date and text. `whodidthis` is created_by when
# present, otherwise signer_id, otherwise NA. Any field a post does not carry
# becomes NA instead of aborting the conversion, and an empty `wall` yields a
# zero-row data frame with the same columns.
# NOTE(review): numeric columns are built with vapply(..., numeric(1)), which
# assumes the parser delivers ids, counts and dates as numbers - confirm.
wall2df <- function(wall) {
  # Build one numeric column; NULL (absent) values turn into NA.
  num_col <- function(extract) {
    vapply(
      wall,
      function(post) {
        value <- extract(post)
        if (is.null(value)) NA_real_ else value
      },
      numeric(1),
      USE.NAMES = FALSE
    )
  }
  # Build one character column; NULL (absent) values turn into NA.
  chr_col <- function(extract) {
    vapply(
      wall,
      function(post) {
        value <- extract(post)
        if (is.null(value)) NA_character_ else value
      },
      character(1),
      USE.NAMES = FALSE
    )
  }

  data.frame(
    uid = num_col(function(post) post$id),
    author = num_col(function(post) post$from_id),
    whodidthis = num_col(function(post) {
      # Prefer the creating admin, fall back to the signer.
      if (!is.null(post$created_by)) post$created_by else post$signer_id
    }),
    type = chr_col(function(post) post$post_type),
    comments = num_col(function(post) post$comments[["count"]]),
    likes = num_col(function(post) post$likes[["count"]]),
    reposts = num_col(function(post) post$reposts[["count"]]),
    date = num_col(function(post) post$date),
    text = chr_col(function(post) post$text),
    # Keep text as character on every R version (pre-4.0 defaults to factors).
    stringsAsFactors = FALSE
  )
}
# VK driver: these globals are read by get_wall_posts() and
# get_likers_commenters(), which use `group_id` without taking it as an argument.
group_id <- 33305945 # id of the Karlsberg group
# Range of post ids to download (inclusive bounds passed to get_wall_posts).
id_min <- 1
id_max <- 2
# NOTE: this reuses the name `posts`, replacing the list of Facebook posts
# created earlier in the script.
posts <- get_wall_posts(id_min, id_max)
# For every wall post collect who liked it, who commented on it and who liked
# those comments.
#
# Argument:
#   posts  data frame with one row per post and a `uid` column (post id).
#
# Reads the global `group_id`.
# NOTE(review): `getURL` looks like RCurl::getURL, but no library(RCurl) call is
# visible in this file - confirm it is attached before running.
#
# Returns a list with one element per row of `posts`; each element is
# list(likers, commenters, comments_likers). `commenters` and
# `comments_likers` are NULL when the post has no comments (or when the
# comments request came back without a `response`). An empty `posts` table
# yields an empty list without issuing any request.
get_likers_commenters <- function(posts) {
  n_posts <- nrow(posts)
  posts_likers_commenters <- vector("list", n_posts)
  cat("1-", n_posts, ": ", sep = "")

  # seq_len() instead of 1:n, which would iterate over c(1, 0) for zero rows.
  for (i in seq_len(n_posts)) {
    request_likers <- paste0(
      "https://api.vk.com/method/likes.getList?owner_id=-",
      group_id, "&type=post&item_id=", posts$uid[i]
    )
    likers <- fromJSON(getURL(request_likers))$response$users

    request_comments <- paste0(
      "https://api.vk.com/method/wall.getComments?v=5.50&owner_id=-",
      group_id, "&post_id=", posts$uid[i]
    )
    comments <- fromJSON(getURL(request_comments))

    commenters <- NULL
    comments_likers <- NULL
    # A reply without `response` has no count; treat it like "no comments"
    # rather than failing with "argument is of length zero".
    n_comments <- comments$response$count
    if (!is.null(n_comments) && n_comments > 0) {
      items <- comments$response$items
      commenters <- unlist(lapply(items, function(comment) comment$from_id))
      comments_ids <- unlist(lapply(items, function(comment) comment$id))
      # One likes.getList request per comment, flattened into a single vector.
      comments_likers <- unlist(lapply(comments_ids, function(comment_id) {
        request_comments_likers <- paste0(
          "https://api.vk.com/method/likes.getList?owner_id=-",
          group_id, "&type=comment&item_id=",
          comment_id
        )
        fromJSON(getURL(request_comments_likers))$response$users
      }))
    }

    posts_likers_commenters[[i]] <- list(
      likers = likers,
      commenters = commenters,
      comments_likers = comments_likers
    )

    # Progress marker every 25 posts, and a longer pause every 200 posts.
    if (i %% 25 == 0) cat(i, " . ")
    if (i %% 200 == 0) Sys.sleep(10)
  }
  posts_likers_commenters
}
# Enrich the members table with readable country / city names and an age.
# NOTE(review): `countries`, `cities` and the city_id / country_id / bdate
# columns of `members` are not created in the visible part of this script -
# presumably they come from a VK download step; confirm.

# Give the lookup tables unambiguous column names before joining.
countries <- rename(countries, country_id = cid, country = name)
cities <- rename(cities, city_id = cid, city = name)
countries$country_id <- as.integer(countries$country_id)
cities$city_id <- as.integer(cities$city_id)

members <- left_join(members, cities, by = "city_id")
members <- left_join(members, countries, by = "country_id")

# Members without a match get an explicit "not specified" label.
members$country[is.na(members$country)] <- "не вказана"
members$city[is.na(members$city)] <- "не вказане"

# Age in whole years. Sys.time() is used for the current moment because no
# package attached by this script provides now().
members$age <- floor(
  as.numeric(difftime(Sys.time(), members$bdate, units = "days")) / 365.25
)
# Ages above 100 are treated as bogus birth dates.
members$age[members$age > 100] <- NA

# Add all of this to the primary posts table.
# The script never assigns `posts_likers_commenters`, so collect it here
# unless an earlier step has already done so.
if (!exists("posts_likers_commenters")) {
  posts_likers_commenters <- get_likers_commenters(posts)
}
# lapply() always yields one list element per post, so each assignment creates
# a list column; sapply() could collapse the result into a matrix (when every
# post has the same number of entries) and break the column assignment.
posts$likers <- lapply(posts_likers_commenters, function(plc) plc$likers)
posts$commenters <- lapply(posts_likers_commenters, function(plc) plc$commenters)
posts$comments_likers <- lapply(
  posts_likers_commenters,
  function(plc) plc$comments_likers
)