I am webscraping reddit using a medium post. Reddit paginates its posts as you scroll down the page. My scraper only gets the posts which have been inserted in at the start, but as a scroll down a reddit sub, I get more and more posts which my scraper doesn't axis.
This is my code:
const request = require('request-promise');
const cheerio = require('cheerio');
const start = async () => {
const SUBREDIT = 'instagram';
const BASE_URL = `https://old.reddit.com/r/${SUBREDIT}/`;
let response = await request(
BASE_URL,
{
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9,fr;q=0.8,ro;q=0.7,ru;q=0.6,la;q=0.5,pt;q=0.4,de;q=0.3',
'cache-control': 'max-age=0',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
);
let $ = cheerio.load(response);
let posts = [];
$('#siteTable > .thing').each((i, elm) => {
let score = {
upvotes: $(elm).find('.score.unvoted').text().trim(),
likes: $(elm).find('.score.likes').text().trim(),
dislikes: $(elm).find('.score.dislikes').text().trim(),
}
let title = $(elm).find('.title').text().trim();
let comments = $(elm).find('.comments').text().trim();
let time = $(elm).find('.tagline > time').attr('title').trim();
let author = $(elm).find('.tagline > .author').text().trim();
posts.push({
title,
comments,
score,
time,
author
});
})
console.log(posts);
}
start();
This pulls up about 30 results. I am looking for a few hundred... how can I go about getting more?
question from:
https://stackoverflow.com/questions/65922731/node-webscraping-get-more-data-from-reddit 与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…