I am currently using a Steam-crawler script (https://github.com/aesuli/steam-crawler) that, given a dataset of game IDs, scrapes Steam review data (date, text of the review, ID of the user, etc.). I am not an expert in HTML scraping, but from what I understood from the code (included below), the script loops over a given game ID to collect all of its reviews until it encounters this specific pattern: endre = re.compile(r'({"success":2})|(no_more_reviews)').
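In other words, the endpoint answers with JSON, and the regex seems to be a string-level way of detecting the "no more data" responses. If I parse the response as JSON instead, my understanding is that the equivalent check would be something like this (my own sketch; is_last_page is a name I made up):

import json

def is_last_page(response_text):
    # Hypothetical helper: success == 1 means more data may follow;
    # anything else (e.g. 2), or an explicit "no_more_reviews" marker,
    # is what the script treats as the end of the review stream.
    try:
        data = json.loads(response_text)
    except ValueError:
        return 'no_more_reviews' in response_text
    return data.get('success') != 1

The full script is: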
import argparse
import csv
import os
import re
import socket
import string
import urllib
import urllib.error
import urllib.parse
import urllib.request
import json
from contextlib import closing
from time import sleep
def download_page(url, maxretries, timeout, pause):
    tries = 0
    htmlpage = None
    while tries < maxretries and htmlpage is None:
        try:
            with closing(urllib.request.urlopen(url, timeout=timeout)) as f:
                htmlpage = f.read()
            sleep(pause)
        except (urllib.error.URLError, socket.timeout, socket.error):
            tries += 1
    return htmlpage

def getgameids(filename):
    ids = set()
    with open(filename, encoding='utf8') as f:
        reader = csv.reader(f)
        for row in reader:
            dir = row[0]  # originally this was 0
            id_ = row[1]
            name = row[2]
            ids.add((dir, id_, name))
    return ids

def getgamereviews(ids, timeout, maxretries, pause, out):
    urltemplate = string.Template(
        'https://store.steampowered.com//appreviews/$id?cursor=$cursor&filter=recent&language=english')
    endre = re.compile(r'({"success":2})|(no_more_reviews)')
    for (dir, id_, name) in ids:
        if dir == 'sub':
            print('skipping sub %s %s' % (id_, name))
            continue
        gamedir = os.path.join(out, 'pages', 'reviews', '-'.join((dir, id_)))
        donefilename = os.path.join(gamedir, 'reviews-done.txt')  # written once all reviews of a game have been extracted
        if not os.path.exists(gamedir):  # create the folder if it does not exist
            os.makedirs(gamedir)
        elif os.path.exists(donefilename):  # the done marker exists, skip the game
            print('skipping app %s %s' % (id_, name))
            continue
        print(dir, id_, name)
        cursor = '*'  # the API returns a new cursor with each page
        offset = 0
        page = 1
        maxError = 10
        errorCount = 0
        i = 0
        while True:
            url = urltemplate.substitute({'id': id_, 'cursor': cursor})
            print(offset, url)
            htmlpage = download_page(url, maxretries, timeout, pause)
            if htmlpage is None:
                print('Error downloading the URL: ' + url)
                sleep(pause * 3)
                errorCount += 1
                if errorCount >= maxError:
                    print('Max error!')
                    break
            else:
                with open(os.path.join(gamedir, 'reviews-%s.html' % page), 'w', encoding='utf-8') as f:
                    htmlpage = htmlpage.decode()
                    if endre.search(htmlpage):
                        break
                    f.write(htmlpage)
                page = page + 1
                parsed_json = json.loads(htmlpage)
                cursor = urllib.parse.quote(parsed_json['cursor'])
        with open(donefilename, 'w', encoding='utf-8') as f:
            pass

def main():
    parser = argparse.ArgumentParser(description='Crawler of Steam reviews')
    parser.add_argument('-f', '--force', help='Force download even if already successfully downloaded', required=False,
                        action='store_true')
    parser.add_argument(
        '-t', '--timeout', help='Timeout in seconds for http connections. Default: 180',
        required=False, type=int, default=180)
    parser.add_argument(
        '-r', '--maxretries', help='Max retries to download a file. Default: 3',
        required=False, type=int, default=3)
    parser.add_argument(
        '-p', '--pause', help='Seconds to wait between http requests. Default: 0.01', required=False, default=0.01,
        type=float)
    parser.add_argument(
        '-m', '--maxreviews', help='Maximum number of reviews per item to download. Default: unlimited', required=False,
        type=int, default=5000000)
    parser.add_argument(
        '-o', '--out', help='Output base path', required=False, default='data')
    parser.add_argument(
        '-i', '--ids', help='File with game ids', required=False, default='./data/games.csv')
    args = parser.parse_args()

    if not os.path.exists(args.out):
        os.makedirs(args.out)

    ids = getgameids(args.ids)
    print('%s games' % len(ids))

    getgamereviews(ids, args.timeout, args.maxretries, args.pause, args.out)


if __name__ == '__main__':
    main()
The issue I am currently facing is that the script does not extract the reviews reliably: for instance, for a game such as Counter-Strike: Global Offensive, which has about ~1,000,000 reviews, the script will sometimes stop after 4000 pages of reviews (each HTML page contains 20 reviews), sometimes after 6000, and sometimes after only 500!
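One way to quantify the problem, if I read the GetReviews documentation (linked in the EDIT below) correctly: when json=1 is passed, the first page (cursor=*) should contain a query_summary object with a total_reviews field, so the expected total could be compared with 20 * the number of pages actually saved. A sketch (parameter and field names taken from that doc page; expected_review_count is my own name):

import json
import urllib.request

def expected_review_count(appid):
    # Hypothetical sketch: per the GetReviews doc, the first page of the
    # json=1 variant should report the total number of reviews, which can
    # be compared against 20 * <number of pages the crawler saved>.
    url = ('https://store.steampowered.com/appreviews/%s'
           '?json=1&cursor=*&filter=recent&language=english' % appid)
    with urllib.request.urlopen(url, timeout=180) as f:
        data = json.loads(f.read().decode())
    return data.get('query_summary', {}).get('total_reviews')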
What I imagined as a solution was to save each URL that the script has already requested, to retry the request up to 10 times whenever endre = re.compile(r'({"success":2})|(no_more_reviews)') matches, and to skip URLs that have already been collected; however, I am not sure it would actually work.
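Concretely, the idea would be something like the sketch below (entirely hypothetical; it reuses download_page and assumes the urltemplate and endre definitions from getgamereviews are in scope):

def fetch_all_pages(id_, maxretries, timeout, pause, max_end_retries=10):
    # Hypothetical sketch of the idea: remember the cursors already used
    # and, when the "end of reviews" answer appears, retry the same URL a
    # few times before accepting that there really is nothing left.
    seen_cursors = set()
    cursor = '*'
    while cursor not in seen_cursors:
        seen_cursors.add(cursor)
        url = urltemplate.substitute({'id': id_, 'cursor': cursor})
        end_retries = 0
        while True:
            htmlpage = download_page(url, maxretries, timeout, pause)
            if htmlpage is None:
                return  # persistent network failure, give up on this game
            htmlpage = htmlpage.decode()
            if not endre.search(htmlpage):
                break  # a normal page of reviews, keep it
            end_retries += 1
            if end_retries >= max_end_retries:
                return  # the "end" answer is stable, stop for real
            sleep(pause * 3)
        yield htmlpage
        cursor = urllib.parse.quote(json.loads(htmlpage)['cursor'])

Skipping cursors that were already seen would at least avoid saving the same page twice, but I do not know whether it addresses the root cause.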
I would also raise the issue on the GitHub page, but the author does not seem to respond often, and I am really interested in why this is happening and whether it is possible to solve it. Thank you in advance.
EDIT: I looked a bit at the Steam API documentation (https://partner.steamgames.com/doc/store/getreviews) and it seems that each page returns a cursor that is used to load the next page. So why is the number of pages I get changing randomly?
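For reference, my reading of that documentation is that the pagination contract looks like the sketch below (json=1, cursor and num_per_page are parameters documented on that page; iter_reviews is my own name, and I have not tested this at the scale of CS:GO):

import json
import urllib.parse
import urllib.request

def iter_reviews(appid):
    # Sketch of the pagination flow described in the GetReviews doc:
    # start with cursor=*, URL-encode the cursor returned by each page,
    # and stop when a page comes back empty or a cursor repeats.
    cursor = '*'
    seen = set()
    while cursor not in seen:
        seen.add(cursor)
        url = ('https://store.steampowered.com/appreviews/%s'
               '?json=1&filter=recent&language=english&num_per_page=100'
               '&cursor=%s' % (appid, urllib.parse.quote(cursor)))
        with urllib.request.urlopen(url, timeout=180) as f:
            data = json.loads(f.read().decode())
        if data.get('success') != 1 or not data.get('reviews'):
            break
        for review in data['reviews']:
            yield review
        cursor = data['cursor']

If the cursor Steam returns is itself unstable between requests, that would explain why the same crawl stops at a different point each run.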