Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
203 views
in Technique[技术] by (71.8m points)

javascript - How to save the current webpage with casperjs/phantomjs?

Is there a way to save the current webpage by using casperjs or phantomjs? I tried to get the html and save it into a file. But the resulting file was a lot different from the screenshot of that time (with casper.capture). Is there a way to save the current webpage?

See Question&Answers more detail:os

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Reply

0 votes
by (71.8m points)

Andrey Borisko suggested to use the disk cache to retrieve the resources. My solution is not that efficient, but you don't need to decompress text files.

I use XMLHttpRequest to retrieve all resources after I registered them with the resource.received event handler. I then filter the resources into images, css and fonts. The current limitation is that remote resource paths that contain something like ../ or ./ are not handled correctly.

I retrieve the current page content with getHTML and iterate over all captured resources to replace the path used in the markup, that is identified by a portion of the complete resource URL, with a randomly generated file name. The file extension is created from the content type of the resource. It is converted using mimeType from this gist.

Since CSS files may contain background images or fonts, they have to be processed before saving to disk. The provided loadResource function loads the resource, but does not save it.

Since XMLHttpRequest is used to download the resources, the script has to be invoked with the --web-security=false flag:

casperjs script.js --web-security=false

script.js

// Modules used by this script. ./mimetype is a local file; findAndReplace reads
// its `ext` table to turn a content type into a file extension.
var casper = require("casper").create();
var utils = require('utils');
var fs = require('fs');
var mimetype = require('./mimetype'); // URL provided below
// Resources recorded by the "resource.received" handler, grouped by content type.
// Each entry is {obj: <resource>, file: false}; findAndReplace later sets `file`
// to the local path chosen for that resource.
var cssResources = [];
var imgResources = [];
var fontResources = [];
// Directory prefix used for the generated local resource file names.
var resourceDirectory = "resources";
// When true, additional progress messages are echoed.
var debug = false;

// Delete the resource directory tree before capturing (presumably left over
// from a previous run).
fs.removeTree(resourceDirectory);

// Diagnostics: echo the payload of selected CasperJS events with a short prefix.
casper.on("remote.message", function onRemoteMessage(message) {
    this.echo("remote.msg: " + message);
});

// These two events are reported as JSON-serialised payloads.
[
    {event: "resource.error", prefix: "res.err: "},
    {event: "page.error", prefix: "page.err: "}
].forEach(function (entry) {
    casper.on(entry.event, function (details) {
        this.echo(entry.prefix + JSON.stringify(details));
    });
});

// Finished downloads are only reported when debugging is switched on.
casper.on("downloaded.file", function onDownloadedFile(targetPath) {
    if (!debug) {
        return;
    }
    this.echo("dl.file: " + targetPath);
});

// Record every finished CSS, image and font response so that capturePage can
// later download it and rewrite the references in the saved markup.
casper.on("resource.received", function(resource){
    // Only completed responses are of interest, and data:* URIs are already
    // embedded in the page, so there is nothing to download for them.
    if (resource.stage !== "end" || resource.url.indexOf("data:") === 0) {
        return;
    }

    // A response may arrive without a content type; calling indexOf on it
    // would throw and abort the handler, so such responses are skipped.
    var contentType = resource.contentType;
    if (typeof contentType !== "string") {
        return;
    }

    // `file` stays false until findAndReplace assigns a local file name.
    if (contentType === "text/css") {
        cssResources.push({obj: resource, file: false});
    }
    if (contentType.indexOf("image/") === 0) {
        imgResources.push({obj: resource, file: false});
    }
    if (contentType.indexOf("application/x-font-") === 0) {
        fontResources.push({obj: resource, file: false});
    }
});

// based on http://docs.casperjs.org/en/latest/modules/casper.html#download
/**
 * Fetches a resource and returns its decoded content without writing it to disk.
 * Modelled on casper.download(): the content is obtained base64-encoded via
 * this.base64encode() and decoded with a clientutils instance.
 *
 * @param {string} url    address of the resource
 * @param {string} method request method forwarded to base64encode (e.g. "GET")
 * @param {*}      data   request payload forwarded to base64encode
 * @returns {string} the decoded resource content
 */
casper.loadResource = function loadResource(url, method, data) {
    "use strict";
    this.checkStarted();
    var clientUtilsModule = require('clientutils');
    var settings = utils.mergeObjects({}, this.options);
    var decoder = clientUtilsModule.create(settings);
    var encodedContent = this.base64encode(url, method, data);
    return decoder.decode(encodedContent);
};


/**
 * Escapes every character that has a special meaning in a regular expression,
 * so the result can be passed to `new RegExp()` and match the input literally.
 *
 * @param {string} string text to escape
 * @returns {string} the text with each of . * + ? ^ = ! : $ { } ( ) | [ ] / \
 *                   prefixed by a backslash
 */
function escapeRegExp(string) {
    // from https://stackoverflow.com/a/1144788/1816580
    // Inside the character class, ], / and \ must themselves be escaped, and
    // the replacement needs "\\$1" to emit a literal backslash before the match.
    return string.replace(/([.*+?^=!:${}()|\[\]\/\\])/g, "\\$1");
}

/**
 * Replaces the matches of `find` in `str` with `replace`.
 *
 * @param {RegExp|string} find    a regular expression (used as given, so it
 *                                needs the "g" flag to replace every match) or
 *                                a plain string, which is matched literally
 *                                and replaced at every occurrence
 * @param {string|Function} replace replacement, passed to String.prototype.replace
 * @param {string} str            text to search
 * @returns {string} the text with the replacements applied
 */
function replaceAll(find, replace, str) {
    // from https://stackoverflow.com/a/1144788/1816580
    var pattern = find;
    if (typeof find === "string") {
        // str.replace() with a string pattern stops after the first match;
        // build an escaped global expression so all occurrences are replaced.
        pattern = new RegExp(find.replace(/[.*+?^${}()|\[\]\/\\]/g, "\\$&"), "g");
    }
    return str.replace(pattern, replace);
}

// The ways a resource URL can be delimited in markup or CSS: double quoted,
// single quoted, or enclosed in parentheses as in url(...). Each function
// returns its argument surrounded by the matching delimiters.
var wrapFunctions = (function () {
    function enclose(opening, text, closing) {
        return opening + text + closing;
    }

    return [
        function wrapQuot1(s) {
            return enclose('"', s, '"');
        },
        function wrapQuot2(s) {
            return enclose("'", s, "'");
        },
        function csswrap(s) {
            return enclose('(', s, ')');
        }
    ];
}());

/**
 * Rewrites the references to captured resources inside a document so that
 * they point at local files, storing each resource on first use.
 *
 * @param {string}   doc               markup or CSS text to rewrite
 * @param {Array}    resources         entries of the form {obj, file}; `file`
 *                                     is set here once a local name is chosen
 * @param {Function} [resourcesReplacer] if given, called with the entry instead
 *                                     of downloading it, so the caller can
 *                                     process the content and write it itself
 * @returns {string} the rewritten document
 */
function findAndReplace(doc, resources, resourcesReplacer) {
    var delegateStorage = typeof resourcesReplacer == "function";

    // Assign a random local file name, then download the resource or hand
    // the entry to the caller-supplied callback.
    function storeResource(entry, url) {
        var randomName = Math.random().toString(36).slice(2);
        var extension = mimetype.ext[entry.obj.contentType];
        entry.file = resourceDirectory + "/" + randomName + "." + extension;

        if (delegateStorage) {
            resourcesReplacer(entry);
            return;
        }
        if (debug) {
            casper.echo("download resource (" + entry.obj.contentType + "): " + url + " to " + entry.file);
        }
        casper.download(url, entry.file, "GET");
    }

    // Try ever shorter tails of the URL (the full URL first, down to its last
    // six characters) and report the first one that occurs, wrapped, in text.
    function findReference(text, url, wrap) {
        var upperBound = url.length - 5;
        var offset;
        var candidate;
        var pattern;
        for (offset = 0; offset < upperBound; offset += 1) {
            candidate = url.substring(offset);
            pattern = new RegExp(escapeRegExp(wrap(candidate)), "g");
            if (text.match(pattern)) {
                return {fragment: candidate, pattern: pattern};
            }
        }
        return null;
    }

    resources.forEach(function (entry) {
        var url = entry.obj.url;

        // An entry that already has a file name was stored earlier.
        if (!entry.file) {
            storeResource(entry, url);
        }

        // Apply each delimiter style in turn to the progressively rewritten text.
        doc = wrapFunctions.reduce(function (text, wrap) {
            var hit = findReference(text, url, wrap);
            if (hit === null) {
                return text;
            }
            if (debug) {
                casper.echo("replace " + hit.fragment + " with " + entry.file);
            }
            return replaceAll(hit.pattern, wrap(entry.file), text);
        }, doc);
    });

    return doc;
}

/**
 * Saves the current page as "page.html" with its images, stylesheets and the
 * images/fonts referenced from those stylesheets rewritten to local files.
 * Must be called with the casper instance as `this` (e.g. via casper.then).
 */
function capturePage(){

    // Strip all <script> and <base> elements from the live document. The
    // callback is passed to evaluate(), so it relies on nothing but `document`.
    this.evaluate(function(){
        ["script", "base"].forEach(function(tagName){
            var nodes = document.querySelectorAll(tagName);
            Array.prototype.forEach.call(nodes, function(node){
                node.parentNode.removeChild(node);
            });
        });
    });

    // TODO: remove all event handlers in html

    // Stylesheets can reference images and fonts themselves, so their content
    // is loaded and rewritten before being written to the chosen file.
    function storeStylesheet(cssResource) {
        var original = casper.loadResource(cssResource.obj.url, "GET");
        var withImages = findAndReplace(original, imgResources);
        var withFonts = findAndReplace(withImages, fontResources);
        fs.write(cssResource.file, withFonts, "wb");
    }

    var markup = findAndReplace(this.getHTML(), imgResources);
    markup = findAndReplace(markup, cssResources, storeStylesheet);
    fs.write("page.html", markup, "wb");
}

// Script entry point: open the page, give it three seconds to finish loading
// its resources, capture it, then report completion and exit.
casper.start("http://www.themarysue.com/");
casper.wait(3000);
casper.then(capturePage);
casper.run(function onFinished() {
    this.echo("DONE");
    this.exit();
});

The magic happens in findAndReplace. capturePage is completely synchronous, so it can be dropped in anywhere without much of a headache.

URL for mimetype.js


与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
OGeek|极客中国-欢迎来到极客的世界,一个免费开放的程序员编程交流平台!开放,进步,分享!让技术改变生活,让极客改变未来! Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...