I am downloading more than twitters rate cap using a loop; however, when I try to append the list it returns an empty dataframe.
My function looks like:
IN:
import pandas as pd
import numpy as np
import tweepy
from datetime import timedelta
def get_tweets(handle):
batch_count_for_tweet_downloads = 200
try:
alltweets = []
tweets = api_twitter.user_timeline(screen_name=handle,
count=batch_count_for_tweet_downloads,
exclude_replies=True,
include_rts=False,
lang="en",
tweet_mode="extended")
# ---GET MORE THAN 200 TWEETS
alltweets.extend(tweets)
oldest = alltweets[-1].id - 1
oldest_datetime = pd.to_datetime(str(pd.to_datetime(oldest))[:-10]).strftime("%Y-%m-%d %H:%M:%S")
print(f"Getting Tweets For " + handle + ", After: " + oldest_datetime)
while len(tweets) > 0:
tweets = api_twitter.user_timeline(screen_name=handle, count=batch_count_for_tweet_downloads, max_id=oldest)
alltweets.extend(tweets)
oldest = alltweets[-1].id - 1
print("Count: " + f"...{len(alltweets)} " + handle + " Tweets Downloaded")
#---
df = pd.DataFrame(data=[tweets.user.screen_name for tweets in alltweets], columns=['Handle'])
df['Tweets'] = np.array([tweets.full_text for tweets in alltweets])
df['Date'] = np.array([tweets.created_at - timedelta(hours=4) for tweets in alltweets])
df['Len'] = np.array([len(tweets.full_text) for tweets in alltweets])
df['Like_count'] = np.array([tweets.favorite_count for tweets in alltweets])
df['RT_count'] = np.array([tweets.retweet_count for tweets in alltweets])
total_tweets.extend(alltweets)
print(handle + " Total Tweets Extracted: {}".format(len(alltweets)))
except:
pass
return df
As you can see I need some help merging the loop into the function.
What is the best way of doing this?
Thank you for your help in advance.
EDIT 1: (What my code looks like now)
IN:
import tweepy
import pandas as pd
import numpy as np
from datetime import timedelta
handles = ['@MrML16419203', '@d00tn00t']
consumerKey = 'x'
consumerSecret = 'x'
accessToken = 'x'
accessTokenSecret = 'x'
authenticate = tweepy.OAuthHandler(consumerKey, consumerSecret)
authenticate.set_access_token(accessToken, accessTokenSecret)
api_twitter = tweepy.API(authenticate, wait_on_rate_limit=True)
total_tweets = []
def get_tweets(handle):
batch_count_for_tweet_downloads = 200
try:
alltweets = []
tweets = api_twitter.user_timeline(screen_name=handle,
count=batch_count_for_tweet_downloads,
exclude_replies=True,
include_rts=False,
lang="en",
tweet_mode="extended")
alltweets.extend(tweets)
oldest = alltweets[-1].id - 1
oldest_datetime = pd.to_datetime(str(pd.to_datetime(oldest))[:-10]).strftime("%Y-%m-%d %H:%M:%S")
print(f"Getting Tweets For " + handle + ", After: " + oldest_datetime)
while len(tweets) > 0:
tweets = api_twitter.user_timeline(screen_name=handle, count=batch_count_for_tweet_downloads, max_id=oldest)
alltweets.extend(tweets)
if len(alltweets) > 0:
oldest = alltweets[-1].id - 1
else:
pass
print("Count: " + f"...{len(alltweets)} " + handle + " Tweets Downloaded")
print('---Total Downloaded: ' + str(len(alltweets)) + ' for ' + handle + '---')
df = pd.DataFrame(data=[tweets.user.screen_name for tweets in alltweets], columns=['Handle'])
df['Tweets'] = np.array([tweets.full_text for tweets in alltweets])
df['Date'] = np.array([tweets.created_at - timedelta(hours=4) for tweets in alltweets])
df['Len'] = np.array([len(tweets.full_text) for tweets in alltweets])
df['Like_count'] = np.array([tweets.favorite_count for tweets in alltweets])
df['RT_count'] = np.array([tweets.retweet_count for tweets in alltweets])
print([tweets.favorite_count for tweets in alltweets])
print(np.array([tweets.favorite_count for tweets in alltweets]))
total_tweets.extend(alltweets)
print("----------Total Tweets Extracted: {}".format(df.shape[0]) + "----------")
except:
pass
return df
df = pd.DataFrame()
for handle in handles:
df_new = get_tweets(handle)
df = pd.concat((df, df_new))
print(df)
OUT:
Getting Tweets For @MrML16419203, After: 2011-03-19 07:03:53
Count: ...136 @MrML16419203 Tweets Downloaded
---Total Downloaded: 136 for @MrML16419203---
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
----------Total Tweets Extracted: 136----------
Getting Tweets For @d00tn00t, After: 2009-11-27 19:18:58
Count: ...338 @d00tn00t Tweets Downloaded
Count: ...530 @d00tn00t Tweets Downloaded
Count: ...546 @d00tn00t Tweets Downloaded
Count: ...546 @d00tn00t Tweets Downloaded
---Total Downloaded: 546 for @d00tn00t---
Handle Tweets Date Len Like_count RT_count
0 MrML16419203 132716 2020-09-02 02:18:28 6.0 0.0 0.0
1 MrML16419203 432881 2020-09-02 02:04:23 6.0 0.0 0.0
2 MrML16419203 973625 2020-09-02 02:04:09 6.0 0.0 0.0
3 MrML16419203 1234567 2020-09-02 01:55:10 7.0 0.0 0.0
4 MrML16419203 225865 2020-09-02 01:27:11 6.0 0.0 0.0
.. ... ... ... ... ... ...
541 d00tn00t NaN NaT NaN NaN NaN
542 d00tn00t NaN NaT NaN NaN NaN
543 d00tn00t NaN NaT NaN NaN NaN
544 d00tn00t NaN NaT NaN NaN NaN
545 d00tn00t NaN NaT NaN NaN NaN
[682 rows x 6 columns]
As you can see for handles which have less than 200 tweets the dataframe gets populated. However, not for handles which contain more than 200 tweets.
See Question&Answers more detail:
os