本程序可下载页面所依赖的 CSS/JS、图片等引用资源,目前不包含下载关联 HTML 页面的功能。代码如下:
/*
*****************************************************************************
* This software is under the Apache License Version 2.0
* Author: Tao - mail:cn.java.river@gmail.com
* Spreading Your Heart
****************************************************************************
*/

package atao.util.html;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;

import org.apache.commons.lang.StringUtils;

/**
 * A simple HTML downloader that saves one page plus the resources it references.
 * <br/>
 * The page is scanned line by line; the first {@code <link href=...>} or
 * {@code src=...} attribute found on a line is downloaded into the workspace
 * folder under its relative path.
 * <br/>
 * <b>Note: This tool won't download related or sub HTML pages, and resources
 * referenced by absolute URLs are skipped.</b>
 * <br/>
 * Usage: {@code java atao.util.html.HtmlDownloader [pageUrl [workspaceDir]]}
 *
 * @author <a href="mailto:cn.java.river@gmail.com">Tao</a>
 * @since 1.0
 */
public class HtmlDownloader
{

    // URL of the page to download; may be overridden by the first program argument.
    private static String url = "http://pervasive2.morselli.unimo.it/~nicola/courses/IngegneriaDelSoftware/java/J6d_xml.html";

    // Workspace folder; may be overridden by the second program argument.
    private static String workspace = "download";

    // Marker that introduces a CSS resource reference.
    private static final String urlSign = "<link href=";

    // Marker that introduces a script or image resource reference.
    private static final String urlSign2 = "src=";

    // Parent ("directory") URL of the page; only reported on the console.
    private static String rootUrl = null;

    /**
     * Downloads the page to {@code download.html} inside the workspace and
     * fetches every resource reference found while copying it.
     *
     * @param args optional: {@code args[0]} page URL, {@code args[1]} workspace folder
     * @throws Exception if the page itself cannot be read or written
     */
    public static void main (String[] args) throws Exception
    {
        if (args != null && args.length > 0)
        {
            url = args[0];
        }
        if (args != null && args.length > 1)
        {
            workspace = args[1];
        }

        long start = System.nanoTime ();
        setRootUrl ();
        InputStream is = new URL (url).openStream ();
        try
        {
            // NOTE: the page is decoded and re-encoded with the platform default charset.
            BufferedReader reader = new BufferedReader (new InputStreamReader (is));
            File f = createDownloadFile ("download.html");
            BufferedWriter writer = new BufferedWriter (new FileWriter (f));
            try
            {
                String s;
                while ((s = reader.readLine ()) != null)
                {
                    writer.write (s);
                    writer.newLine ();
                    if (hasSubUrl (s))
                    {
                        downloadChild (getSubUrl (s));
                    }
                }
            }
            finally
            {
                // Closing also flushes the buffered page content.
                writer.close ();
            }
        }
        finally
        {
            // Closing the underlying stream releases the connection even on failure.
            is.close ();
        }
        System.out.println ("Download time(s):" + String.format ("%.3f", (double)(System.nanoTime () - start)/ 1000000000.00));
    } // end of main

    /**
     * Derives the parent URL of the page and prints it.
     * A URL without a path component (for example {@code http://host}) is
     * used unchanged instead of being cut inside the {@code ://} separator.
     */
    private static void setRootUrl ()
    {
        int schemeEnd = url.indexOf ("://");
        int lastSlash = url.lastIndexOf ("/");
        // Index of the second slash of "://", or -1 when there is no scheme separator.
        int authoritySlash = (schemeEnd == -1) ? -1 : schemeEnd + 2;
        rootUrl = (lastSlash > authoritySlash) ? url.substring (0, lastSlash) : url;
        System.out.println ("Root Url is:" + rootUrl);
    }

    /**
     * Checks whether a line contains one of the resource markers.
     *
     * @param text line of html content, may be {@code null}.
     * @return {@code true} if the line contains a resource marker
     */
    private static boolean hasSubUrl (String text)
    {
        return text != null
            && text.length () > 0
            && (text.contains (urlSign) || text.contains (urlSign2));
    }

    /**
     * Extracts the first resource reference from a line.
     * The value must directly follow the marker (optional whitespace allowed)
     * and be enclosed in double or single quotes.
     *
     * @param text line of html content, may be {@code null}.
     * @return the quoted value, or {@code null} if the line has no marker or
     *         the marker is not followed by a complete quoted value
     */
    private static String getSubUrl (String text)
    {
        if (text == null)
        {
            return null;
        }
        String sign = urlSign;
        int pos = text.indexOf (sign);
        if (pos == -1)
        {
            sign = urlSign2;
            pos = text.indexOf (sign);
        }
        if (pos == -1)
        {
            return null;
        }

        int i = pos + sign.length ();
        while (i < text.length () && Character.isWhitespace (text.charAt (i)))
        {
            i++;
        }
        if (i >= text.length ())
        {
            return null;
        }
        char quote = text.charAt (i);
        if (quote != '"' && quote != '\'')
        {
            // e.g. script code such as "img.src=foo;" - not an attribute value.
            return null;
        }
        int end = text.indexOf (quote, i + 1);
        if (end == -1)
        {
            return null;
        }
        String subUrl = text.substring (i + 1, end);
        System.out.println ("subUrl is :" + subUrl);
        return subUrl;
    }

    /**
     * Downloads one resource as raw bytes. <b>Note: don't use Java character
     * writers here, otherwise pictures are corrupted.</b>
     * Absolute references are skipped; failures are reported on
     * {@code System.err} and do not abort the page download.
     *
     * @param subUrl reference as found in the page, may be {@code null} or empty
     */
    private static void downloadChild (String subUrl)
    {
        if (subUrl == null || subUrl.length () == 0)
        {
            System.out.println ("subUrl is Empty.");
            return;
        }
        if (isAbsoluteUrl (subUrl))
        {
            System.out.println ("subUrl not support yet.");
            return;
        }

        long start = System.nanoTime ();
        try
        {
            // Resolve against the page URL so "/x" and "../x" follow URL rules.
            URL u = new URL (new URL (url), subUrl.replace (" ", "%20"));
            // Validate the local target before issuing a network request.
            File f = createDownloadFile (stripQueryAndFragment (subUrl));
            InputStream in = u.openStream ();
            try
            {
                FileOutputStream out = new FileOutputStream (f);
                try
                {
                    byte[] buff = new byte[4096];
                    int size;
                    while ((size = in.read (buff)) != -1)
                    {
                        out.write (buff, 0, size);
                    }
                }
                finally
                {
                    out.close ();
                }
            }
            finally
            {
                in.close ();
            }
        }
        catch (IOException e)
        {
            System.err.println ("Failed to download " + subUrl + ": " + e);
        }
        System.out.println ("Source:" + subUrl +"download time(s):" + String.format ("%.3f", (double)(System.nanoTime () - start)/ 1000000000.00));
    }

    /**
     * Tells whether a reference is absolute: protocol-relative ({@code //host/..})
     * or starting with a scheme such as {@code https:}, {@code data:} or
     * {@code javascript:}.
     */
    private static boolean isAbsoluteUrl (String subUrl)
    {
        if (subUrl.startsWith ("//"))
        {
            return true;
        }
        int colon = subUrl.indexOf (':');
        if (colon <= 0)
        {
            return false;
        }
        // A colon only marks a scheme when it precedes any path, query or fragment.
        for (int i = 0; i < colon; i++)
        {
            char c = subUrl.charAt (i);
            if (c == '/' || c == '?' || c == '#')
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Removes a trailing query string and/or fragment so that the remaining
     * text can be used as a local file path.
     */
    private static String stripQueryAndFragment (String subUrl)
    {
        int end = subUrl.length ();
        int query = subUrl.indexOf ('?');
        if (query != -1)
        {
            end = query;
        }
        int fragment = subUrl.indexOf ('#');
        if (fragment != -1 && fragment < end)
        {
            end = fragment;
        }
        return subUrl.substring (0, end);
    }

    /**
     * Resolves a file inside the workspace and creates its parent folders.
     *
     * @param url relative path of a url source.
     * @return the file to write to (not yet created).
     * @throws IOException if the path resolves outside the workspace or the
     *         parent folders cannot be created
     */
    private static File createDownloadFile (String url) throws IOException
    {
        File base = new File (workspace).getCanonicalFile ();
        File f = new File (base, url).getCanonicalFile ();

        String basePath = base.getPath ();
        if (!basePath.endsWith (File.separator))
        {
            basePath += File.separator;
        }
        // Canonical paths have ".." resolved, so a prefix check detects escapes.
        if (!f.getPath ().startsWith (basePath))
        {
            throw new IOException ("Refusing to write outside workspace: " + url);
        }

        File parent = f.getParentFile ();
        if (!parent.isDirectory () && !parent.mkdirs ())
        {
            throw new IOException ("Cannot create folder: " + parent);
        }
        return f;
    }

}

更多相关文章

  1. HTML5 标签audio添加网页背景音乐代码
  2. IE中页面不居中,火狐谷歌等正常
  3. 如何使html页面中的文本变为可编辑的?
  4. 我无法理解为什么我的代码中的单击选择文本
  5. 当锚标记被单击时,角值从一个页面传递到另一个页面
  6. HTML5实现一个可编辑的模板页面
  7. 使用jsPDF生成一个保存HTML页面样式的pdf。
  8. jQuery .load停止嵌入页面/重新加载整个页面的视频
  9. js字符串与html代码互相转换时怪想法:自己解析js字符串成普通字

随机推荐

  1. JS(JavaScript)的j进一步了解9(更新中··
  2. angular $ http承诺被退回两次
  3. 求助 急 js替换字符串问题
  4. 开心菜鸟系列----变量的解读(javascript入门篇
  5. eval是邪恶的,但它有缺陷吗?(复制)
  6. 访问D3的var格式数据
  7. 【JavaScript】JavaScript的对象-对象专
  8. Javascript通过Ajax与C#约会
  9. 在javascript中识别/获取选定的上下文菜
  10. 如何将window.location设置为特定路径(没