java爬虫-初识

想找一些图片做桌面背景，但是又不想一张张去下载，后来就想到了爬虫……

对于爬虫我也没具体用过，在网上一顿搜索后写了个小demo。

爬虫的具体思路就是：

1.调用url爬取网页信息

2.解析网页信息

3.保存数据

刚开始还用正则去匹配，获取img标签中的src地址，但是发现有很多不便（主要是我对正则不太熟），后来发现了jsoup这个神器。jsoup是一款Java的HTML解析器，可直接解析某个URL地址或HTML文本内容。它提供了一套非常省力的API，可通过DOM、CSS以及类似于jQuery的操作方法来取出和操作数据。

以下就用爬取图片为例：

import com.crawler.domain.PictureInfo;import org.bson.types.ObjectId;import org.springframework.data.mongodb.core.MongoTemplate;import org.springframework.data.mongodb.gridfs.GridFsTemplate;import org.springframework.stereotype.Service;import org.apache.commons.io.FileUtils;import org.apache.http.HttpEntity;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.util.EntityUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import org.springframework.util.DigestUtils;import org.springframework.util.StringUtils;import javax.annotation.Resource;import java.io.*;import java.net.HttpURLConnection;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;/** * 爬虫实现 *@program: crawler * @description * @author: wl * @create: 2021-01-12 17:56 **/@Servicepublic class CrawlerService {  /**     * @param url      要抓取的网页地址     * @param encoding 要抓取网页编码     * @return     */    public String getHtmlResourceByUrl(String url, String encoding) {        URL urlObj = null;        HttpURLConnection uc = null;        InputStreamReader isr = null;        BufferedReader reader = null;        StringBuffer buffer = new StringBuffer();        // 建立网络连接        try {            urlObj = new URL(url);            // 打开网络连接            uc =(HttpURLConnection) urlObj.openConnection();　　　　　　　// 模拟浏览器请求            uc.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");            // 建立文件输入流            isr = new InputStreamReader(uc.getInputStream(), encoding);            // 建立缓存导入 将网页源代码下载下来            reader = new 
BufferedReader(isr);            // 临时            String temp = null;            while ((temp = reader.readLine()) != null) {// System.out.println(temp+"\n");                buffer.append(temp + "\n");            }            System.out.println("爬取结束:"+buffer.toString());        } catch (Exception e) {            e.printStackTrace();        } finally {            // 关流            if (isr != null) {                try {                    isr.close();                } catch (IOException e) {                    e.printStackTrace();                }            }        }        return buffer.toString();    }   /**     * 下载图片     *     * @param listImgSrc     */    public void Download(List<PictureInfo> listImgSrc) {        int count = 0;        try {            for (int i = 0; i < listImgSrc.size(); i++) {                try {                    PictureInfo pictureInfo = listImgSrc.get(i);                    String url=pictureInfo.getSrc();                    String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());                    URL uri = new URL(url);                    // 打开连接                    URLConnection con = uri.openConnection();                    //设置请求超时为                    con.setConnectTimeout(5 * 1000);                    con.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");                    // 输入流                    InputStream is = con.getInputStream();                    // 1K的数据缓冲                    byte[] bs = new byte[1024];                    // 读取到的数据长度                    int len;                    // 输出的文件流                    String src = url.substring(URL.length());                    int index = src.lastIndexOf('/');                    String fileName = src.substring(0, index + 1);                    File sf = new File(SAVE_PATH + fileName);                    if (!sf.exists()) {                        sf.mkdirs();                    }                    OutputStream os = new 
FileOutputStream(sf.getPath() + "\\" + imageName);                    System.out.println(++count + ".开始下载:" + url);                    // 开始读取                    while ((len = is.read(bs)) != -1) {                        os.write(bs, 0, len);                    }                    // 完毕，关闭所有链接                    os.close();                    is.close();                    System.out.println(imageName + ":--下载完成");                } catch (IOException e) {                    System.out.println("下载错误"+e);                }            }        } catch (Exception e) {            e.printStackTrace();            System.out.println("下载失败"+e);        }    }    /**     * 得到网页中图片的地址-推荐     * 使用jsoup     * @param htmlStr html字符串     * @return List<String>     */    public List<PictureInfo> getImgStrJsoup(String htmlStr) {        List<PictureInfo> pics = new ArrayList<PictureInfo>();        //获取网页的document树        Document imgDoc = Jsoup.parse(htmlStr);        //获取所有的img        Elements alts = imgDoc.select("img[src]");        for (Element alt : alts) {            PictureInfo p=new PictureInfo();            p.setSrc(alt.attr("src"));            p.setAlt(alt.attr("alt"));            p.setTitle(alt.attr("title"));            pics.add(p);        }        return pics;    }}

主要方法就这些，只要爬取下来的网页信息包含img标签，就能扒下其对应的图片。

最新2020整理收集的一些高频面试题（都整理成文档），有很多干货，包含mysql，netty，spring，线程，spring cloud、jvm、源码、算法等详细讲解，也有详细的学习规划图，面试题整理等，需要获取这些内容的朋友请加Q君样：909038429
欢迎加入Java交流Q群：909038429，一起吹水聊天。

更多相关文章

随机推荐