You may get some speedup by creating default dicts and copying them to your records. Here is sample code with some comments...
def update_users_genre_lang_score(cursor):
# you are asking for a lot of stuff but only using a little. Is this
# stuff consumed in this function?
cursor.execute("select user_id,playDuration,lang,genre from sd_archive_track_clicks where playDuration > 15 and user_id!=0 and genre!=0 and lang!=0 and lang <21 and genre <24 and playDate > '2016-10-01'order by playDate desc")
# what is the commit for?
# db.commit()
numrows = int(cursor.rowcount)
tracks_played= cursor.fetchall()
#print tracks_played
genre_list=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23]
genre_default = {genre:0 for genre in genre_list}
lang_list=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
lang_default = {lang:0 for lang in lang_list}
#initialization part
user_genre_score = {}
user_lang_score = {}
for track in tracks_played:
user_id = track['user_id']
user_genre_score[user_id]=genre_default.copy()
user_lang_score[user_id]=lang_default.copy()
#initialization part end
# this seems like an expensive way to initialize to 1 instead of 0...
# am i missing something?!
for track in tracks_played:
user_genre_score[track['user_id']][track['genre']] += 1
user_lang_score[track['user_id']][track['lang']] += 1
UPDATE
You could initialize with collections.defaultdict
so that items are generated dynamically as you touch them. This saves you from revisiting the nodes for each time user_id appears in the rows.
import collections
def update_users_genre_lang_score(cursor):
cursor.execute("select user_id,playDuration,lang,genre from sd_archive_track_clicks where playDuration > 15 and user_id!=0 and genre!=0 and lang!=0 and lang <21 and genre <24 and playDate > '2016-10-01'order by playDate desc")
# what is the commit for?
# db.commit()
numrows = int(cursor.rowcount)
tracks_played= cursor.fetchall()
#print tracks_played
#initialization part
# this creates a two level nested dict ending in an integer count
# that generates items dynamically
user_genre_score = collections.defaultdict(lambda: collections.defaultdict(int))
user_lang_score = collections.defaultdict(lambda: collections.defaultdict(int))
#initialization part end
for track in tracks_played:
user_genre_score[track['user_id']][track['genre']] += 1
user_lang_score[track['user_id']][track['lang']] += 1
How It Works
defaultdict
can make your brain explode - fair warning. With dict
, accessing a non-existent key raises KeyError
. But with defaultdict
, it calls an initializer you supply and creates a key for you. When you call int()
you get a 0.
>>> int()
0
So if we make it the initializer, you get 0 when you first access a new key
>>> d1 = collections.defaultdict(int)
>>> d1
defaultdict(<class 'int'>, {})
>>> d1['user1']
0
>>> d1
defaultdict(<class 'int'>, {'user1': 0})
If you increment a new key, python first gets the item which does the initialization
>>> d1['user2'] += 1
>>> d1
defaultdict(<class 'int'>, {'user1': 0, 'user2': 1})
But you need two levels of dicts..., so have the outer one create inner defaultdict
>>> d2 = collections.defaultdict(lambda:collections.defaultdict(int))
>>> d2['user1']
defaultdict(<class 'int'>, {})
>>> d2['user1']['genre1']
0
>>> d2
defaultdict(<function <lambda> at 0x7efedf493bf8>, {'user1': defaultdict(<class 'int'>, {'genre1': 0})})
>>> d2['user1']['genre2'] += 1
>>> d2
defaultdict(<function <lambda> at 0x7efedf493bf8>, {'user1': defaultdict(<class 'int'>, {'genre1': 0, 'genre2': 1})})