小程序：Java下载单页HTML（可下载引用资源）

本程序可下载页面所依赖的 CSS/JS、图片等引用资源，目前不包含下载关联 HTML 页面的功能。代码如下：
/*
 *****************************************************************************
 * This software is under the Apache License Version 2.0
 * Author: Tao -  mail:cn.java.river@gmail.com
 * Spreading Your Heart
 ****************************************************************************
 */

package atao.util.html;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;

import org.apache.commons.lang.StringUtils;

/**
 * 
 * A Simple HTML downloader which can also download Page resources.
 * <br/>
 * <b>Note: This Tool won't download related or sub HTML</b>
 * <p>
 * Usage: <code>HtmlDownloader [pageUrl [workspaceFolder]]</code>. Both
 * arguments are optional; when omitted the built-in defaults are used.
 * The page is scanned line by line and at most one resource reference is
 * picked up per line.
 * 
 * @author <a href="mailto:cn.java.river@gmail.com">Tao</a>
 * @since 1.0
 */
public class HtmlDownloader
{

    //URL will be downloaded (default, can be overridden by the first argument).
    private static String url = "http://pervasive2.morselli.unimo.it/~nicola/courses/IngegneriaDelSoftware/java/J6d_xml.html";
    
    //workspace folder (default, can be overridden by the second argument).
    private static String workspace = "download";
    
    //sub css and js resources sign 
    private static final String LINK_SIGN = "<link href=";
    
    //sub image resources sign
    private static final String SRC_SIGN = "src=";
    
    //URL parent, only used for the informational log line.
    private static String rootUrl = null;

    /**
     * Downloads the page to <code>download.html</code> inside the workspace
     * and, for every line carrying a resource marker, the referenced resource.
     * 
     * @param args optional: <code>args[0]</code> page URL,
     *             <code>args[1]</code> workspace folder.
     * @throws Exception if the page itself cannot be read or written.
     */
    public static void main (String[] args) throws Exception
    {
        if (args != null && args.length > 0 && args[0] != null && args[0].length () > 0)
        {
            url = args[0];
        }
        if (args != null && args.length > 1 && args[1] != null && args[1].length () > 0)
        {
            workspace = args[1];
        }

        long start = System.nanoTime ();
        setRootUrl ();
        InputStream is = new URL (url).openStream ();
        try
        {
            // NOTE(review): reader and writer both use the platform default
            // charset; a page in another encoding may be altered when saved.
            BufferedReader reader = new BufferedReader (new InputStreamReader (is));
            File f = createDownloadFile ("download.html");
            BufferedWriter writer = new BufferedWriter (new FileWriter (f));
            try
            {
                String s;
                while ((s = reader.readLine ()) != null)
                {
                    writer.write (s);
                    writer.newLine ();
                    if (hasSubUrl (s))
                    {
                        downloadChild (getSubUrl (s));
                    }
                }
            }
            finally
            {
                // closed even when reading or a write fails half way.
                writer.close ();
            }
        }
        finally
        {
            // closing the underlying stream also ends the reader built on it.
            is.close ();
        }
        System.out.println ("Download time(s):" + String.format ("%.3f", (double)(System.nanoTime () - start)/ 1000000000.00));
    } // end of main

    /**
     * set root url for the downloading html. A URL without any path
     * (no "/" after the "scheme://" part) is used unchanged.
     */
    private static void setRootUrl ()
    {
        int schemeEnd = url.indexOf ("://");
        int firstPathPos = (schemeEnd == -1) ? 0 : schemeEnd + 3;
        int pos = url.lastIndexOf ("/");
        // only cut at a slash that is not part of "://".
        rootUrl = (pos >= firstPathPos) ? url.substring (0, pos) : url;
        System.out.println ("Root Url is:" + rootUrl);
    }

    /**
     * check if content includes sub resources.
     * 
     * @param text line of html content.
     * @return Yes or Not
     */
    private static boolean hasSubUrl (String text)
    {
        return text != null
            && text.length () > 0
            && (text.contains (LINK_SIGN) || text.contains (SRC_SIGN));
    }

    /**
     * generate sub url from line content. The link marker is preferred over
     * the src marker; only the first occurrence on the line is used.
     * 
     * @param text line of html content.
     * @return the attribute value, or <code>null</code> when the line has no
     *         marker or the value cannot be read from this line.
     */
    private static String getSubUrl (String text)
    {
        if (text == null)
        {
            return null;
        }
        int valueStart;
        int pos = text.indexOf (LINK_SIGN);
        if (pos != -1)
        {
            valueStart = pos + LINK_SIGN.length ();
        }
        else
        {
            pos = text.indexOf (SRC_SIGN);
            if (pos == -1)
            {
                return null;
            }
            valueStart = pos + SRC_SIGN.length ();
        }
        String value = readAttributeValue (text, valueStart);
        if (value != null)
        {
            System.out.println ("subUrl is :" + value);
        }
        return value;
    }

    /**
     * read an attribute value starting right after the "=" sign. Accepts
     * double quoted, single quoted and unquoted values.
     * 
     * @param text line of html content.
     * @param start index of the first character after "=".
     * @return trimmed value, or <code>null</code> if there is no value or the
     *         closing quote is not on this line.
     */
    private static String readAttributeValue (String text, int start)
    {
        int i = start;
        while (i < text.length () && Character.isWhitespace (text.charAt (i)))
        {
            i++;
        }
        if (i >= text.length ())
        {
            return null;
        }
        char first = text.charAt (i);
        if (first == '"' || first == '\'')
        {
            // the value ends at the matching quote character, not at any quote.
            int end = text.indexOf (first, i + 1);
            return (end == -1) ? null : text.substring (i + 1, end).trim ();
        }
        int end = i;
        while (end < text.length ()
            && !Character.isWhitespace (text.charAt (end))
            && text.charAt (end) != '>')
        {
            end++;
        }
        return text.substring (i, end).trim ();
    }

    /**
     * check if a reference carries its own scheme ("https:", "data:", ...)
     * or is protocol relative ("//host/path").
     * 
     * @param ref reference taken from the page.
     * @return true if the reference is not relative to the page.
     */
    private static boolean isAbsoluteUrl (String ref)
    {
        if (ref.startsWith ("//"))
        {
            return true;
        }
        int colon = ref.indexOf (':');
        if (colon <= 0)
        {
            return false;
        }
        for (int i = 0; i < colon; i++)
        {
            char c = ref.charAt (i);
            boolean letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            boolean other = (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.';
            // a scheme starts with a letter and has no "/", "?" or "#" in it.
            if (!letter && (i == 0 || !other))
            {
                return false;
            }
        }
        return true;
    }

    /**
     * resolve a relative reference against the page url, so that "/x" is
     * taken from the host root and "../x" from the parent folder.
     * 
     * @param subUrl relative reference taken from the page.
     * @return the URL to download from.
     * @throws IOException if the resulting URL is malformed.
     */
    private static URL resolveSubUrl (String subUrl) throws IOException
    {
        String forUrl = subUrl.replace (" ", "%20");
        return new URL (new URL (url), forUrl);
    }

    /**
     * strip query string and fragment, they are not part of a file name.
     * 
     * @param subUrl relative reference taken from the page.
     * @return the path part of the reference.
     */
    private static String toLocalPath (String subUrl)
    {
        int cut = subUrl.length ();
        int query = subUrl.indexOf ('?');
        if (query != -1)
        {
            cut = query;
        }
        int fragment = subUrl.indexOf ('#');
        if (fragment != -1 && fragment < cut)
        {
            cut = fragment;
        }
        return subUrl.substring (0, cut);
    }

    /**
     * download sub resources,<b>Note: don't use Java Character Writers,
     * otherwise you can't get pictures correctly.</b>
     * Failures are reported and do not stop the page download.
     * 
     * @param subUrl reference taken from the page, may be null or empty.
     */
    private static void downloadChild (String subUrl)
    {
        if (subUrl == null || subUrl.length () == 0)
        {
            System.out.println ("subUrl is Empty.");
            return;
        }
        if (isAbsoluteUrl (subUrl))
        {
            System.out.println ("subUrl not support yet.");
            return;
        }

        long start = System.nanoTime ();
        try
        {
            URL u = resolveSubUrl (subUrl);
            InputStream reader = u.openStream ();
            try
            {
                File f = createDownloadFile (toLocalPath (subUrl));
                FileOutputStream writer = new FileOutputStream (f);
                try
                {
                    byte[] buff = new byte[1024];
                    int size = -1;
                    while ((size = reader.read (buff)) != -1)
                    {
                        writer.write (buff, 0, size);
                    }
                }
                finally
                {
                    writer.close ();
                }
            }
            finally
            {
                reader.close ();
            }
        }
        catch (Exception e)
        {
            // best effort: one broken resource must not abort the page.
            e.printStackTrace ();
        }
        System.out.println ("Source:" + subUrl +"download time(s):" + String.format ("%.3f", (double)(System.nanoTime () - start)/ 1000000000.00));
    }

    /**
     * create sub file,create parent folders if necessary. The path comes
     * from the downloaded page, so anything that would end up outside the
     * workspace folder is refused.
     * 
     * @param url related path of a url source.
     * @return created file.
     * @throws IOException if the path leaves the workspace or the parent
     *         folders cannot be created.
     */
    private static File createDownloadFile (String url) throws IOException
    {
        File root = new File (workspace).getCanonicalFile ();
        File f = new File (root, url).getCanonicalFile ();
        String rootPath = root.getPath ();
        String prefix = rootPath.endsWith (File.separator) ? rootPath : rootPath + File.separator;
        if (!f.getPath ().startsWith (prefix))
        {
            throw new IOException ("Refusing to write outside workspace: " + url);
        }
        File parent = f.getParentFile ();
        if (!parent.isDirectory () && !parent.mkdirs ())
        {
            throw new IOException ("Cannot create folder: " + parent);
        }
        return f;
    }

}
更多相关文章

随机推荐