Example

Fetch the Wikipedia homepage, parse it to a DOM, and select the headlines from the "In the news" section into a list of Elements (online sample):

// Fetch the Wikipedia homepage and parse the response into a DOM.
Document doc = Jsoup.connect("http://en.wikipedia.org/").get();
// CSS query: <a> elements inside <b> elements under the element with id "mp-itn".
Elements newsHeadlines = doc.select("#mp-itn b a");

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<!-- Maven dependency entry for jsoup (groupId org.jsoup, artifactId jsoup), pinned to version 1.10.2. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>

/**
 * Crawler for the land listing pages under {@code http://land.fang.com/market/}.
 *
 * <p>{@link #analysePage} collects detail-page links from the paginated listing pages with
 * HTMLParser, and {@link #getData} downloads each detail page with jsoup and prints selected
 * fields to standard output.
 */
public class test {

    /** Listing-page URL up to (but excluding) the page number and the {@code .html} suffix. */
    private static final String LISTING_URL_PREFIX = "http://land.fang.com/market/________1_0_";

    /** Text every detail-page link must contain. */
    private static final String MARKET_URL_PREFIX = "http://land.fang.com/market/";

    /**
     * Exact length a link must have to be treated as a detail page, e.g.
     * {@code http://land.fang.com/market/37eae58c-c701-4e4f-b1af-3e0c8e3be1c6.html}.
     */
    private static final int DETAIL_URL_LENGTH = 69;

    /**
     * Index of the first link inspected on a listing page; all links before it are skipped
     * (presumably site navigation -- TODO confirm against the live page).
     */
    private static final int FIRST_LINK_INDEX = 97;

    /**
     * Collects detail-page links from listing page {@code startPage} up to {@code endpage},
     * printing each link and the details of each linked page exactly once.
     *
     * @param url       ignored; the listing URL is always derived from {@code startPage}
     * @param startPage first listing page to read; values below 1 are treated as page 1
     * @param endpage   last listing page to read; later pages are only followed while this value
     *                  does not exceed the highest page number found in the pager plus one
     * @return the detail-page links found, in page order
     * @throws Exception declared for backward compatibility; listing-page failures are reported
     *                   on standard error and do not propagate
     */
    public List<String> analysePage(String url, int startPage, int endpage) throws Exception {
        // Clamp once so that pages below 1 are not crawled repeatedly as "page 1".
        int page = Math.max(startPage, 1);
        // Highest page number seen in the pager, plus one; stays 0 if the page could not be read.
        int pagerLimit = 0;
        List<String> links = new ArrayList<String>();

        url = LISTING_URL_PREFIX + page + ".html";
        try {
            // Use a filter to keep only the <A> tags of the page.
            Parser parser = new Parser(url);
            NodeList anchors = parser.extractAllNodesThatMatch(new NodeFilter() {
                // Accept link tags only.
                public boolean accept(Node node) {
                    return node instanceof LinkTag;
                }
            });

            String previousTitle = "";
            for (int i = FIRST_LINK_INDEX; i < anchors.size(); i++) {
                LinkTag anchor = (LinkTag) anchors.elementAt(i);
                String href = anchor.extractLink();
                if (href.length() == DETAIL_URL_LENGTH && href.contains(MARKET_URL_PREFIX)) {
                    links.add(href);
                    System.out.println(href);
                }

                String title = anchor.getStringText();
                if (isNumeric(title)) {
                    pagerLimit = Integer.parseInt(title) + 1;
                }
                // Stop at the first non-numeric link that follows a run of numeric (pager) links.
                if (isNumeric(previousTitle) && !isNumeric(title)) {
                    break;
                }
                previousTitle = title;
            }
        } catch (Exception e) {
            // Best effort: keep whatever links were collected before the failure.
            System.err.println("Failed to analyse listing page " + url);
            e.printStackTrace();
        }

        // Fetch details for this page's own links only. Links from later pages are handled by
        // the recursive call that discovers them, so no detail page is downloaded twice.
        for (String link : links) {
            getData(link);
        }
        if (page < endpage && endpage <= pagerLimit) {
            links.addAll(analysePage(url, page + 1, endpage));
        }
        return links;
    }

    /**
     * Downloads one detail page and prints its fields, one per line, followed by a separator.
     *
     * <p>Output order: serial number, region, location, total area, construction land area,
     * planned building area, requisitioned area, plot ratio, greening rate, commercial ratio,
     * building density, height limit, transfer form, transfer term, position, title, planned use.
     *
     * <p>A page that cannot be downloaded, or whose layout differs from the expected one, is
     * reported on standard error and skipped.
     *
     * @param introUrl URL of the detail page
     */
    public static void getData(String introUrl) {
        try {
            Document doc = Jsoup.connect(introUrl).get();
            // NOTE(review): both lookups pass two space-separated class names to
            // getElementsByClass -- confirm this matches as intended on the jsoup version in use.
            Elements detailTables = doc.getElementsByClass("tablebox02 mt10");
            Elements serialBoxes = doc.getElementsByClass("menubox01 mt20");
            if (detailTables.isEmpty() || serialBoxes.isEmpty()) {
                System.err.println("Unexpected page layout, skipping " + introUrl);
                return;
            }

            // Serial number.
            System.out.println(
                    getSplitValue(serialBoxes.get(0).getElementsByTag("span").text(), ":", 1));

            Element table = detailTables.get(0).child(0);

            // Rows 0-2: the value is the second child of each cell (region ... requisitioned area).
            for (int row = 0; row <= 2; row++) {
                for (int col = 0; col <= 1; col++) {
                    System.out.println(table.child(row).child(col).child(1).text());
                }
            }
            // Rows 3-6: the cell text is "label:value" (plot ratio ... position).
            for (int row = 3; row <= 6; row++) {
                for (int col = 0; col <= 1; col++) {
                    System.out.println(getSplitValue(table.child(row).child(col).text(), ":", 1));
                }
            }
            Element lastRow = table.child(7);
            // Title.
            System.out.println(
                    getSplitValue(lastRow.child(0).getElementsByAttribute("title").text(), ":", 1));
            // Planned use: keep only the text before ">>".
            System.out.println(getSplitValue(lastRow.child(1).child(1).text(), ">>", 0));

            System.out.println("=========================");
        } catch (IOException e) {
            System.err.println("Failed to download " + introUrl);
            e.printStackTrace();
        } catch (IndexOutOfBoundsException e) {
            // A missing row or cell must not abort the whole crawl.
            System.err.println("Unexpected table layout, skipping " + introUrl);
            e.printStackTrace();
        }
    }

    /** Crawls the first listing page and prints the details of every land entry found on it. */
    public static void main(String[] args) throws Exception {
        new test().analysePage("http://land.fang.com/market/________1_0_1.html", 1, 1);
    }

    /**
     * Splits {@code value} and returns one trimmed part.
     *
     * @param value text to split; {@code null} yields an empty string
     * @param cha   delimiter, interpreted as a regular expression by {@link String#split(String)}
     * @param index position of the wanted part; if it is out of range the first part is returned
     * @return the trimmed part, or an empty string when the split produces no parts
     */
    public static String getSplitValue(String value, String cha, int index) {
        if (value == null) {
            return "";
        }
        String[] parts = value.split(cha);
        // split drops trailing empty strings, so e.g. ":" split on ":" yields no parts at all.
        if (parts.length == 0) {
            return "";
        }
        int chosen = (index >= 0 && index < parts.length) ? index : 0;
        return parts[chosen].trim();
    }

    /** Returns true when {@code text} is non-empty and consists only of digits. */
    private static boolean isNumeric(String text) {
        if (text == null || text.isEmpty()) {
            return false;
        }
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isDigit(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}

更多相关文章

  1. 自定义标签 Unable to find setter method for attribute
  2. Java标准标签库学习小结
  3. java解析xml问题:如何获得一级标签下全部内容?
  4. 实现<table>标签的动态新增和后台接受<table>标签的方法
  5. Java:创建程序以查找圆柱体的表面积和体积

随机推荐

  1. android学习之LinearLayout
  2. Android版本与Linux内核的关系
  3. 让Activity变成一个窗口
  4. Android(安卓)Studio 运行出现 Multiple
  5. Android学习资源
  6. android中的自定义控件
  7. Android 源码分析 —— 从 Toast 出发
  8. android ddms查看线程
  9. Android ApiDemos示例解析(167):Views->L
  10. traceview进行Android性能测试