This Python code pulls user posts and page likes from raw Facebook data extracted from the Facebook Graph API. The code then saves the data to CSV files (one for likes and one for posts).
"""
@author: Robert Vidigal, PhD (CSMaP-NYU)
"""
import glob
import os
from io import StringIO

import pandas as pd
from tqdm import tqdm
# Reading JSON files and converting to CSV
# Columns kept in the likes CSV, in output order.
_LIKE_COLUMNS = ['name', 'id', 'is_media']

# Columns kept in the posts CSV, in output order.
_POST_COLUMNS = [
    'message',
    'shares',
    'like.summary.total_count',
    'love.summary.total_count',
    'wow.summary.total_count',
    'haha.summary.total_count',
    'sad.summary.total_count',
    'angry.summary.total_count',
]

# Post columns that are filled with None when a user's posts lack them.
_OPTIONAL_POST_COLUMNS = ('message', 'shares')


def _parse_user_file(file):
    """Parse one per-user JSON file into ``(likes, posts)`` DataFrames.

    Returns ``None`` (after printing a notice) when the user's likes or posts
    are empty or lack the key column, so the caller skips that user.
    Any parsing error propagates to the caller.
    """
    # The user id is the part of the file name before the first '__'.
    user_id = os.path.basename(file).split('__')[0]
    df = pd.read_json(file, lines=True)

    # Assumes the first row's 'likes' / 'posts' cells hold JSON-lines text
    # -- TODO confirm against the extraction step. Wrapping the text in
    # StringIO avoids passing a literal JSON string to read_json, which is
    # deprecated in recent pandas releases.
    likes = pd.read_json(StringIO(df['likes'].iloc[0]), lines=True)
    if likes.empty or 'id' not in likes.columns:
        print(f'{user_id} LIKES are empty')
        return None
    # .assign builds a new frame, so no column is written onto a slice.
    likes = likes[_LIKE_COLUMNS].assign(userid=user_id)

    posts = pd.read_json(StringIO(df['posts'].iloc[0]), lines=True)
    if posts.empty or 'like.summary.total_count' not in posts.columns:
        print(f'{user_id} POSTS are empty')
        return None
    # Add only the optional columns that are actually missing; an existing
    # 'message' or 'shares' column must never be overwritten with None.
    for optional in _OPTIONAL_POST_COLUMNS:
        if optional not in posts.columns:
            posts[optional] = None
    posts = posts[_POST_COLUMNS].assign(userid=user_id)

    return likes, posts


def readFiles(lang,
              indir='/Users/rb5286/Downloads/facebook/2022/',
              outdir='/Users/rb5286/Downloads/facebook/'):
    """Convert the per-user JSON files of one language folder into two CSVs.

    Every file matching ``<indir>/<lang>/*json*`` is parsed. The likes of all
    users are written to ``CSIP_Wave_2_facebook_<lang>_likes.csv`` and the
    posts to ``CSIP_Wave_2_facebook_<lang>_posts.csv`` inside ``outdir``.
    Finally the number of like rows and post rows is printed.

    Args:
        lang: Name of the sub-folder of ``indir`` to read (also used in the
            output file names).
        indir: Directory that contains the per-language sub-folders.
        outdir: Directory the two CSV files are written to.

    Returns:
        None.
    """
    # Sorted so that the row order of the CSVs is reproducible between runs.
    files = sorted(glob.glob(os.path.join(indir, lang, '*json*')))

    likes_frames = []
    posts_frames = []
    for file in tqdm(files):
        # Best-effort batch job: a file that cannot be parsed is reported
        # and skipped instead of aborting the whole run.
        try:
            parsed = _parse_user_file(file)
        except Exception as e:
            print(f'{file} has error {e}')
            continue
        # NOTE(review): a user is dropped from BOTH outputs when either the
        # likes or the posts are empty -- confirm this is intended.
        if parsed is None:
            continue
        likes, posts = parsed
        likes_frames.append(likes)
        posts_frames.append(posts)

    # Concatenate once at the end: concatenating inside the loop re-copies
    # the accumulated frame on every iteration. pd.concat rejects an empty
    # list, so fall back to an empty frame when nothing was parsed.
    likes_df = (pd.concat(likes_frames, ignore_index=True)
                if likes_frames else pd.DataFrame())
    posts_df = (pd.concat(posts_frames, ignore_index=True)
                if posts_frames else pd.DataFrame())

    likes_df.to_csv(
        os.path.join(outdir, f'CSIP_Wave_2_facebook_{lang}_likes.csv'),
        index=False,
    )
    posts_df.to_csv(
        os.path.join(outdir, f'CSIP_Wave_2_facebook_{lang}_posts.csv'),
        index=False,
    )
    print(len(likes_df), len(posts_df))
Run the function defined above once for each language folder.
# Convert every language folder in turn; each call writes that
# language's likes CSV and posts CSV.
for lan in ('en', 'sp'):
    readFiles(lan)