jsoup 分页抓取网页数据Java HTML Parser
16lz
2021-01-22
Example
Fetch the Wikipedia homepage, parse it to a DOM, and select the headlines from the "In the news" section into a list of Elements (online sample):
// Fetch the Wikipedia homepage and parse it to a DOM.
Document doc = Jsoup.connect("http://en.wikipedia.org/").get();
// Select the headlines from the "In the news" section into a list of Elements.
Elements newsHeadlines = doc.select("#mp-itn b a");
<!-- Maven dependency for jsoup (org.jsoup:jsoup, version 1.10.2). -->
<!-- Artifact page: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
/**
 * Crawls the paginated listing at {@code http://land.fang.com/market/}, collects the
 * detail-page links found on each listing page, and prints selected fields of every
 * detail page to standard output.
 *
 * <p>Listing pages are read through {@code Parser}/{@code NodeList}/{@code LinkTag};
 * detail pages are read through jsoup ({@code Jsoup}, {@code Document}, {@code Elements}).
 */
public class test {

    /** Prefix shared by listing-page and detail-page URLs. */
    private static final String MARKET_PREFIX = "http://land.fang.com/market/";

    /**
     * Exact length of a detail-page URL: the prefix, a 36-character id and ".html"
     * (69 characters, e.g.
     * {@code http://land.fang.com/market/37eae58c-c701-4e4f-b1af-3e0c8e3be1c6.html}).
     */
    private static final int DETAIL_URL_LENGTH = MARKET_PREFIX.length() + 36 + ".html".length();

    /**
     * Index of the first link on a listing page that is inspected; earlier links are skipped.
     * NOTE(review): presumably the first 97 links are site navigation - confirm against the
     * current page layout.
     */
    private static final int FIRST_INSPECTED_LINK = 97;

    /**
     * Collects detail-page links from listing page {@code startPage} and, while allowed,
     * from the following pages up to {@code endpage}; every collected link is passed to
     * {@link #getData(String)} exactly once.
     *
     * <p>The next page is visited only when {@code startPage < endpage} and {@code endpage}
     * does not exceed the highest page number seen in the current page's pagination links
     * plus one. If the listing page cannot be read, the error is printed and no further
     * pages are visited.
     *
     * @param url       kept for interface compatibility; the value is not used because the
     *                  listing URL is always rebuilt from {@code startPage}
     * @param startPage 1-based listing page to start from (values below 1 are treated as 1
     *                  when building the URL)
     * @param endpage   last listing page the caller wants to visit
     * @return the detail-page links of all visited listing pages, in visiting order
     * @throws Exception declared for compatibility; failures while reading a listing page
     *                   are caught and printed, other failures may propagate
     */
    public List<String> analysePage(String url, int startPage, int endpage) throws Exception {
        int detectedEndPage = 0;
        List<String> links = new ArrayList<String>();
        String pageUrl = MARKET_PREFIX + "________1_0_" + Math.max(startPage, 1) + ".html";
        try {
            // Keep only the link (<a>) tags of the listing page.
            Parser parser = new Parser(pageUrl);
            NodeList nodeList = parser.extractAllNodesThatMatch(new NodeFilter() {
                public boolean accept(Node node) {
                    return node instanceof LinkTag;
                }
            });

            String previousTitle = "";
            for (int i = FIRST_INSPECTED_LINK; i < nodeList.size(); i++) {
                LinkTag link = (LinkTag) nodeList.elementAt(i);
                String href = link.extractLink();
                if (href.length() == DETAIL_URL_LENGTH && href.contains(MARKET_PREFIX)) {
                    links.add(href);
                    System.out.println(href);
                }

                // Links whose text is a number are pagination links; remember the highest + 1.
                String title = link.getStringText();
                if (isNumeric(title)) {
                    detectedEndPage = Integer.parseInt(title) + 1;
                }
                // The first non-numeric link after a numeric one ends the pagination run.
                if (isNumeric(previousTitle) && !isNumeric(title)) {
                    break;
                }
                previousTitle = title;
            }
        } catch (Exception e) {
            // Best effort: report the failure and return whatever was collected so far.
            e.printStackTrace();
        }

        // Scrape this page's detail links BEFORE merging deeper pages: the recursive call
        // scrapes its own links, so scraping the merged list would fetch them repeatedly.
        for (String link : links) {
            getData(link);
        }
        if (startPage < endpage && endpage <= detectedEndPage) {
            links.addAll(analysePage(url, startPage + 1, endpage));
        }
        return links;
    }

    /**
     * Tells whether {@code text} consists of 1 to 9 ASCII digits, which guarantees that
     * {@link Integer#parseInt(String)} accepts it without overflow.
     */
    private static boolean isNumeric(String text) {
        if (text == null || text.isEmpty() || text.length() > 9) {
            return false;
        }
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }

    /**
     * Downloads one detail page and prints its fields, one per line, followed by a
     * separator line. I/O failures are printed and swallowed; a page that lacks the
     * expected blocks or cells is reported on standard error and skipped.
     *
     * @param introUrl URL of the detail page
     */
    public static void getData(String introUrl) {
        try {
            Document doc = Jsoup.connect(introUrl).get();
            // NOTE(review): getElementsByClass is given two space-separated class names;
            // presumably this only matches when the class attribute is exactly this
            // string - confirm against the jsoup version in use.
            Elements detailTables = doc.getElementsByClass("tablebox02 mt10");
            Elements headerBoxes = doc.getElementsByClass("menubox01 mt20");
            if (detailTables.isEmpty() || headerBoxes.isEmpty()) {
                System.err.println("Skipping " + introUrl + ": expected detail blocks not found");
                return;
            }

            // Serial number
            System.out.println(getSplitValue(headerBoxes.get(0).getElementsByTag("span").text(), ":", 1));

            Element table = detailTables.get(0).child(0);
            System.out.println(table.child(0).child(0).child(1).text()); // region
            System.out.println(table.child(0).child(1).child(1).text()); // location
            System.out.println(table.child(1).child(0).child(1).text()); // total area
            System.out.println(table.child(1).child(1).child(1).text()); // construction land area
            System.out.println(table.child(2).child(0).child(1).text()); // planned building area
            System.out.println(table.child(2).child(1).child(1).text()); // area expropriated on behalf
            System.out.println(getSplitValue(table.child(3).child(0).text(), ":", 1)); // floor area ratio
            System.out.println(getSplitValue(table.child(3).child(1).text(), ":", 1)); // greening rate
            System.out.println(getSplitValue(table.child(4).child(0).text(), ":", 1)); // commercial proportion
            System.out.println(getSplitValue(table.child(4).child(1).text(), ":", 1)); // building density
            System.out.println(getSplitValue(table.child(5).child(0).text(), ":", 1)); // height limit
            System.out.println(getSplitValue(table.child(5).child(1).text(), ":", 1)); // transfer form
            System.out.println(getSplitValue(table.child(6).child(0).text(), ":", 1)); // transfer term
            System.out.println(getSplitValue(table.child(6).child(1).text(), ":", 1)); // position
            System.out.println(getSplitValue(
                    table.child(7).child(0).getElementsByAttribute("title").text(), ":", 1)); // title
            System.out.println(getSplitValue(table.child(7).child(1).child(1).text(), ">>", 0)); // planned use
            System.out.println("=========================");
        } catch (IOException e) {
            e.printStackTrace();
        } catch (IndexOutOfBoundsException e) {
            // A page with a different layout must not abort the whole crawl.
            System.err.println("Skipping " + introUrl + ": unexpected page layout");
            e.printStackTrace();
        }
    }

    /** Runs the crawl for listing page 1 only. */
    public static void main(String[] args) throws Exception {
        new test().analysePage("http://land.fang.com/market/________1_0_1.html", 1, 1);
        // getDownloadUrl("http://land.fang.com/market/37eae58c-c701-4e4f-b1af-3e0c8e3be1c6.html");
    }

    /**
     * Splits {@code value} on the literal delimiter {@code cha} and returns the trimmed
     * part at {@code index}; when {@code index} is out of range the first part is returned.
     *
     * @param value text to split; {@code null} yields an empty string
     * @param cha   delimiter, taken literally (regex metacharacters have no special meaning)
     * @param index zero-based position of the wanted part
     * @return the trimmed part, or an empty string when there is nothing to return
     */
    public static String getSplitValue(String value, String cha, int index) {
        if (value == null) {
            return "";
        }
        String[] parts = value.split(java.util.regex.Pattern.quote(cha));
        if (parts.length == 0) {
            // e.g. value equal to the delimiter: split() drops all trailing empty parts.
            return "";
        }
        int chosen = (index >= 0 && index < parts.length) ? index : 0;
        return parts[chosen].trim();
    }
}
更多相关文章
- 自定义标签 Unable to find setter method for attribute
- Java标准标签库学习小结
- java解析xml问题:如何获得一级标签下全部内容?
- 实现<table>标签的动态新增和后台接受<table>标签的方法
- Java:创建程序以查找圆柱体的表面积和体积