This Python code pulls user posts and page likes from raw Facebook data extracted via the Facebook Graph API, then saves them to CSV files (one for likes and one for posts per language).

"""
@author: Robert Vidigal, PhD (CSMaP-NYU)
"""
import glob
import os
from io import StringIO

import pandas as pd
from tqdm import tqdm

# Reading JSON files and converting to CSV
def readFiles(lang, indir='/Users/rb5286/Downloads/facebook/2022/',
              outdir='/Users/rb5286/Downloads/facebook/'):
    """Flatten one language's raw Facebook dumps into a likes CSV and a posts CSV.

    Every file matching ``<indir>/<lang>/*json*`` is read as JSON Lines. Only
    the first row of each file is used: its ``likes`` and ``posts`` cells are
    expected to contain JSON Lines text, which is parsed into one table each.
    Both tables are tagged with a ``userid`` column taken from the file name
    (the part before the first ``__``) and appended to the combined output.

    A file is skipped, with a message printed, when its likes table is empty
    or has no ``id`` column, when its posts table is empty or has no
    ``like.summary.total_count`` column, or when reading it raises.

    Args:
        lang: Name of the sub-directory of ``indir`` to read; also embedded in
            the output file names.
        indir: Directory holding one sub-directory of raw files per language.
        outdir: Directory that receives the two CSV files.

    Returns:
        None. Writes ``CSIP_Wave_2_facebook_<lang>_likes.csv`` and
        ``CSIP_Wave_2_facebook_<lang>_posts.csv`` into ``outdir`` and prints
        the row count of each.
    """
    columns = ['message', 'shares', 'like.summary.total_count', 'love.summary.total_count',
               'wow.summary.total_count', 'haha.summary.total_count',
               'sad.summary.total_count', 'angry.summary.total_count']

    # glob returns files in arbitrary order; sorting keeps the CSV row order
    # reproducible between runs.
    files = sorted(glob.glob(os.path.join(indir, lang, '*json*')))

    # Per-user tables are collected and concatenated once at the end, instead
    # of re-copying the growing result on every iteration.
    likes_frames = []
    posts_frames = []

    for file in tqdm(files):
        try:
            user_id = os.path.basename(file).split('__')[0]
            df = pd.read_json(file, lines=True)

            # The cell holds JSON text, not a path; newer pandas requires such
            # literal text to be wrapped in a file-like object.
            likes = pd.read_json(StringIO(df['likes'].tolist()[0]), lines=True)
            if likes.empty or 'id' not in likes.columns:
                # NOTE(review): this also drops the user's posts -- confirm
                # that users without likes should be excluded entirely.
                print(f'{user_id} LIKES are empty')
                continue
            likes = likes[['name', 'id', 'is_media']].assign(userid=user_id)

            posts = pd.read_json(StringIO(df['posts'].tolist()[0]), lines=True)
            if posts.empty or 'like.summary.total_count' not in posts.columns:
                # NOTE(review): this also drops the user's likes -- confirm
                # that users without posts should be excluded entirely.
                print(f'{user_id} POSTS are empty')
                continue
            # Add only the column that is actually missing, so an existing
            # 'message' is not wiped just because 'shares' is absent (and
            # vice versa).
            for optional in ('message', 'shares'):
                if optional not in posts.columns:
                    posts[optional] = None
            posts = posts[columns].assign(userid=user_id)

        except Exception as e:
            # Best effort: report the unreadable file and move on.
            print(f'{file} has error {e}')
            continue

        likes_frames.append(likes)
        posts_frames.append(posts)

    # pd.concat rejects an empty list; fall back to an empty frame so the
    # output files are still written when nothing was usable.
    likes_df = pd.concat(likes_frames, ignore_index=True) if likes_frames else pd.DataFrame()
    posts_df = pd.concat(posts_frames, ignore_index=True) if posts_frames else pd.DataFrame()

    likes_df.to_csv(os.path.join(outdir, f'CSIP_Wave_2_facebook_{lang}_likes.csv'), index=False)
    posts_df.to_csv(os.path.join(outdir, f'CSIP_Wave_2_facebook_{lang}_posts.csv'), index=False)
    print(len(likes_df), len(posts_df))

Run the function defined above once for each language.

# Export the likes/posts CSVs for each language sub-directory in turn.
for lan in ('en', 'sp'):
    readFiles(lan)