On modern hardware, traversing a directory with 500K files is nothing. When you call fs.readdir asynchronously in Node.js, it only reads the list of file names in the specified directory; it does not read the files' contents. I just tested it with 700K files in a directory, and loading that list of names took only about 21 MB of memory.
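If you want to check a figure like that yourself, a minimal sketch (the directory path is just a placeholder) is to compare process.memoryUsage() before and after the readdir call:

var fs = require('fs');

var dir = '/home/user'; // hypothetical directory containing many files
var before = process.memoryUsage().heapUsed;

fs.readdir(dir, function (err, files) {
    if (err) throw err;
    var after = process.memoryUsage().heapUsed;
    // Rough heap growth caused by holding the array of file names.
    console.log(files.length + ' names, ~' +
        Math.round((after - before) / 1024 / 1024) + ' MB of heap');
});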
Once you have that list of file names, you can traverse them one by one, or in parallel with a concurrency limit, and easily process them all. Example:
var async = require('async'),
    fs = require('fs'),
    path = require('path'),
    parentDir = '/home/user';

async.waterfall([
    function (cb) {
        fs.readdir(parentDir, cb);
    },
    function (files, cb) {
        // `files` is just an array of file names, not full paths.
        // Consume 10 files in parallel.
        async.eachLimit(files, 10, function (filename, done) {
            var filePath = path.join(parentDir, filename);
            // Do whatever you want with this file,
            // then don't forget to call `done()`.
            done();
        }, cb);
    }
], function (err) {
    err && console.trace(err);
    console.log('Done');
});
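If you would rather not depend on the async library, here is a similar sketch using Node's built-in promise API (fs.promises). It processes the names in fixed-size batches to keep concurrency bounded; the batch size of 10 is just an arbitrary choice for illustration:

const fs = require('fs').promises;
const path = require('path');

const parentDir = '/home/user';

async function run() {
    const files = await fs.readdir(parentDir);
    // Process 10 files at a time so we never have more
    // than 10 operations in flight.
    for (let i = 0; i < files.length; i += 10) {
        const batch = files.slice(i, i + 10);
        await Promise.all(batch.map(async (filename) => {
            const filePath = path.join(parentDir, filename);
            // Do whatever you want with filePath here.
        }));
    }
    console.log('Done');
}

run().catch(console.error);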