# This Python code extracts each user's home timeline, likes, posts, and friends (i.e., the accounts the user follows) from raw Twitter data obtained through the Twitter API v2, and then saves each data set to a CSV file. It works with Twitter data in multiple languages.
"""
@author: rb5286
"""
import json
import pandas as pd
import glob
from tqdm import tqdm
# Columns kept from every tweet-like record (home timeline, posts, likes).
_TWEET_COLUMNS = ['text', 'is_quote_status', 'retweet_count', 'favorite_count',
                  'lang', 'entities.hashtags', 'entities.urls',
                  'user.screen_name', 'user.location', 'user.followers_count']
# Columns kept from every followed-account ("friends") record.
_FRIEND_COLUMNS = ['id', 'screen_name', 'location']
# (output name, key in the raw JSON, label used in the "empty" message, columns)
_SECTIONS = (
    ('timeline', 'home-timeline', 'HOME TIMELINE is', _TWEET_COLUMNS),
    ('posts', 'user-timelines', 'POSTS are', _TWEET_COLUMNS),
    ('likes', 'favorites', 'LIKES are', _TWEET_COLUMNS),
    ('friends', 'friends', 'FRIENDS are', _FRIEND_COLUMNS),
)


def _extract_user_frames(data, user_id):
    """Return one user's four tables, or None if any raw section is empty.

    Args:
        data: the parsed JSON document of a single user.
        user_id: value stored in the added ``userid`` column of every table.

    Returns:
        A dict mapping 'timeline', 'posts', 'likes' and 'friends' to a
        DataFrame restricted to the selected columns plus ``userid``; or None
        (after printing which section was empty) when the user is skipped.
    """
    raw = {}
    for name, key, label, _ in _SECTIONS:
        frame = pd.json_normalize(data, record_path=[key])
        if frame.empty:
            print(f'{user_id} {label} empty')
            return None
        raw[name] = frame

    # Columns are selected only once all four sections are known to be
    # non-empty, so a user with an empty section is skipped before any
    # column lookup can fail.
    tables = {}
    for name, _, _, columns in _SECTIONS:
        # .copy() gives an independent frame, so adding 'userid' does not
        # trigger pandas' chained-assignment warning.
        table = raw[name][columns].copy()
        table['userid'] = user_id
        tables[name] = table
    return tables


def readFiles(lang, base_dir='/Users/rb5286/Downloads/twitter3/'):
    """Flatten one language's raw Twitter JSON dumps into four CSV files.

    Every file matching ``{base_dir}{lang}/*json*`` is parsed as one user's
    JSON document; the user id is the part of the file name before the first
    ``__``. Four record lists are read from each document: ``home-timeline``,
    ``user-timelines`` (posts), ``favorites`` (likes) and ``friends``. A user
    is skipped entirely, with a printed message, as soon as one of these
    lists turns out to be empty.

    The combined tables are written to ``Twitter_{lang}_timeline.csv``,
    ``Twitter_{lang}_posts.csv``, ``Twitter_{lang}_likes.csv`` and
    ``Twitter_{lang}_friends.csv`` inside ``base_dir``, and the four row
    counts are printed in that order.

    Args:
        lang: name of the sub-folder of ``base_dir`` holding the raw files;
            also used in the output file names.
        base_dir: folder containing the per-language sub-folders and
            receiving the CSV files. Must end with a path separator because
            paths are built by plain string concatenation.

    Raises:
        KeyError: if a document lacks one of the four sections, or a
            non-empty section lacks one of the selected columns.
    """
    files = glob.glob(f'{base_dir}{lang}/*json*')

    # Per-user tables are collected in lists and concatenated once at the
    # end; concatenating inside the loop re-copies all rows on every file.
    collected = {name: [] for name, _, _, _ in _SECTIONS}

    for file in tqdm(files):
        # Explicit UTF-8: the tweets are multilingual and must not depend on
        # the platform's default encoding.
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        user_id = file.split('/')[-1].split('__')[0]

        tables = _extract_user_frames(data, user_id)
        if tables is None:
            continue
        for name, table in tables.items():
            collected[name].append(table)

    totals = []
    for name, _, _, columns in _SECTIONS:
        if collected[name]:
            combined = pd.concat(collected[name], ignore_index=True)
        else:
            # pd.concat rejects an empty list; write a header-only CSV so the
            # output can still be read back.
            combined = pd.DataFrame(columns=columns + ['userid'])
        combined.to_csv(f'{base_dir}Twitter_{lang}_{name}.csv', index=False)
        totals.append(len(combined))
    print(*totals)
# Run the function defined above for each language folder.
# Build the CSV exports for each language folder, one after the other.
for lan in ('en', 'sp'):
    readFiles(lan)