由于网络的开放性,我们浏览网站时,网站都会把页面数据发送到本地,这就造就了采集的环境.

之前研究了一段时间的采集,在刘建的帮助下,终于可以把别人网站的内容采集到自己的网站上显示出来. 但是这样有一个很大的弊端,那就是如果被采集的网站关闭了,你的网站也会因为采集不到内容而无法显示. 解决这个问题的最好办法是把采集到的数据存放到本地,这样就算别人的网站挂了,对自己的网站也没有影响. 经过和刘建的讨论,总结了如下采集流程,今天把它实现了.


根据流程图,代码如下:

using System;
using System.Collections.Generic;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Data.SqlClient;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

/// <summary>
/// Code-behind for the chapter page. On a request it checks whether the book and the
/// chapter ("section") are already stored locally; if either is missing, the chapter is
/// scraped from the remote site, its images are copied to local disk and the result is
/// written to the database. Otherwise the chapter is read straight from the database.
/// </summary>
public partial class chapter : System.Web.UI.Page
{
    // Values consumed by the page markup.
    protected string title, content, newContent, bookurl, readurl, provpage, nextpage, zhangjie, keywords, description;

    // Path shape of attachment images inside scraped chapter content.
    private const string AttachmentPattern = @"/files/article/attachment/(\d+)_(\d+)_(\d+)_(\d+)\.(\w+)";

    // Only these extensions are written to disk: the file name comes from remote HTML,
    // so an unrestricted extension would let the remote site drop e.g. ".aspx" files
    // under the web root.
    private static readonly string[] AllowedImageExtensions = { "jpg", "jpeg", "gif", "png", "bmp" };

    /// <summary>
    /// Loads the chapter identified by the <c>bookid</c> / <c>chapterid</c> (and, when a
    /// remote fetch is needed, <c>cutid</c>) query-string values and fills the protected
    /// fields used by the markup. Responds with HTTP 400 when an id is not purely numeric.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        string bookid = Request.QueryString["bookid"];
        string sectionid = Request.QueryString["chapterid"];
        jiangs_Tools.check_str(bookid);
        jiangs_Tools.check_str(sectionid);

        // The ids are concatenated into SQL text below, so they must be plain digits.
        // (The database helper visible here only accepts a finished SQL string;
        // parameterized commands would be preferable if it offers them.)
        if (!IsDigits(bookid) || !IsDigits(sectionid))
        {
            Response.StatusCode = 400;
            return;
        }

        jiang_Db_Sql newdb = new jiang_Db_Sql();
        // NOTE(review): Exec_Sql is assumed to return true when the count query finds a row - confirm.
        bool hasbook = RunExistsQuery(newdb, "select count(*) from [book] where [bookid]=" + bookid);
        bool hassection = RunExistsQuery(newdb, "select count(*) from [section] where [sectionid]=" + sectionid);

        if (!hasbook || !hassection)
        {
            // Book or chapter is not stored yet: scrape it and persist what is missing.
            string cutid = Request.QueryString["cutid"];
            if (!IsDigits(cutid))
            {
                // cutid ends up in the remote URL and in rewritten links that are stored.
                Response.StatusCode = 400;
                return;
            }

            FetchChapterFromRemote(cutid, bookid, sectionid);
            Response.Flush(); // kept from the original: flush buffered output before the slower save work

            newContent = content; // savePic rewrites the image paths in this copy for storage

            if (!hasbook)
            {
                RunNonQuery(newdb, string.Format(
                    "insert into [book]([bookId],[bookName]) values('{0}','{1}')",
                    bookid, SqlEscape(title)));
            }

            // Collect the image paths found in the chapter content, separated by '|'.
            string imgContent = jiangs_Rex.GetRegValue(AttachmentPattern, content, "|", "");
            if (!string.IsNullOrEmpty(imgContent))
            {
                savePic(imgContent);
            }

            RunNonQuery(newdb, string.Format(
                "insert into [section]([sectionId],[bookId],[sectionTitle],[sectionContent],[readUrl],[provPage],[nextPage]) values('{0}','{1}','{2}','{3}','{4}','{5}','{6}')",
                sectionid, bookid, SqlEscape(zhangjie), SqlEscape(newContent),
                SqlEscape(readurl), SqlEscape(provpage), SqlEscape(nextpage)));
        }
        else
        {
            // Both book and chapter are stored: read everything from the database.
            LoadChapterFromDatabase(newdb, bookid, sectionid);
        }

        this.Page.Title = title + ">> " + zhangjie + " - 天下小说网";
        keywords = "\"" + title + "最新章节列表," + title + "全文阅读," + title + "TXT电子书下载," + title + "JAR电子书下载," + title + "UMD电子书下载\"";
        description = "\"天下小说网为小说爱好者提供" + title + "最近更新章节阅读," + title + "全文在线阅读," + title
           + "最新章节电子书下载(包括" + title + "的TXT格式下载、" + title + "的JAR格式下载、" + title + "的UMD格式下载)\"";
    }

    /// <summary>
    /// Downloads every image listed in <paramref name="imgcontent"/> into
    /// <c>/pic/section/</c> and rewrites the image paths in <c>newContent</c> to that
    /// local folder. Download or write failures are logged and skipped (best effort).
    /// </summary>
    /// <param name="imgcontent">Image paths separated by '|'; a leading '|' is tolerated.</param>
    public void savePic(string imgcontent)
    {
        if (string.IsNullOrEmpty(imgcontent))
        {
            return;
        }

        // RemoveEmptyEntries copes with the leading '|' without blindly dropping the first character.
        string[] imagePaths = imgcontent.Split(new char[] { '|' }, StringSplitOptions.RemoveEmptyEntries);

        using (WebClient client = new WebClient())
        {
            foreach (string imagePath in imagePaths)
            {
                Match match = Regex.Match(imagePath, AttachmentPattern);
                if (!match.Success)
                {
                    continue; // not an attachment path, nothing to download
                }

                string extension = match.Groups[5].Value;
                if (Array.IndexOf(AllowedImageExtensions, extension.ToLowerInvariant()) < 0)
                {
                    continue; // refuse to store non-image files coming from the remote site
                }

                // Local file name: the flat "a_b_c_d.ext" form used in the scraped content.
                string imgName = string.Format("{0}_{1}_{2}_{3}.{4}",
                    match.Groups[1].Value, match.Groups[2].Value,
                    match.Groups[3].Value, match.Groups[4].Value, extension);

                // The image host stores the same file in nested folders: a/b/c/d.ext.
                string picurl = "http://2.yxmimi.com/" + string.Format(
                    "files/article/attachment/{0}/{1}/{2}/{3}.{4}",
                    match.Groups[1].Value, match.Groups[2].Value,
                    match.Groups[3].Value, match.Groups[4].Value, extension);

                try
                {
                    byte[] imageBytes = client.DownloadData(picurl);
                    File.WriteAllBytes(Server.MapPath("/pic/section/" + imgName), imageBytes);
                }
                catch (Exception ex)
                {
                    // Best effort: one missing image must not break the chapter, but
                    // record the failure instead of swallowing it silently.
                    System.Diagnostics.Trace.TraceWarning(
                        "savePic: could not store image '{0}': {1}", picurl, ex.Message);
                }
            }
        }

        if (newContent != null)
        {
            // Point the stored content at the local copies of the images.
            newContent = Regex.Replace(newContent, AttachmentPattern, "/pic/section/$1_$2_$3_$4.$5");
        }
    }

    /// <summary>
    /// Fetches the chapter page from the remote site and extracts book title, chapter
    /// title, content and navigation URLs into the page fields.
    /// </summary>
    private void FetchChapterFromRemote(string cutid, string bookid, string sectionid)
    {
        string html = jiangs_Rex.GetRemoteHtmlCode(
            "http://www.lovepd.com/chapter.php?cutid=" + cutid + "&bookid=" + bookid + "&chapterid=" + sectionid);

        // Turn absolute links to the remote site into site-relative ones.
        html = html.Replace("http://www.lovepd.com/", "");
        html = html.Replace("http://lovepd.com/", "");
        html = jiangs_Rex.ReplaceListUrl(html, @"(chapter)\.php\?cutid=(\d+)&bookid=(\d+)&chapterid=(\d+)", 4);
        html = jiangs_Rex.ReplaceListUrl(html, @"(read)\.php\?cutid=(\d+)\&bookid=(\d+)", 3); // table-of-contents links

        content = jiangs_Rex.GetRegValue(@"(?<=<p align=""left"">)(.*?)(?=</p>)", html, 1); // chapter body
        html = jiangs_Rex.ReplaceListUrl(html, @"readend\.php\?bookid=(\d+)", 1, "/read/" + cutid); // table-of-contents links
        content = jiangs_Rex.ReplaceContentImgUrl(content); // convert image URLs for image-based novels
        content = content.Replace("第九文学www.d9123.com", ""); // strip the source site's watermark text

        title = jiangs_Rex.GetRegValue(@"(\w+)最新章节列表</b>", html, 1); // book name
        bookurl = "/book/" + bookid + ".html"; // book introduction page
        readurl = jiangs_Rex.GetRegValue(@"main_page = ""([^""]+)""", html, 1); // table-of-contents URL
        provpage = jiangs_Rex.GetRegValue(@"back_page = ""([^""]+)""", html, 1); // previous-page URL
        nextpage = jiangs_Rex.GetRegValue(@"next_page = ""([^""]+)""", html, 1); // next-page URL
        zhangjie = jiangs_Rex.GetRegValue(@"(?<=<h1>)([^>]+)(?=</h1>)", html, 1); // chapter title
    }

    /// <summary>Reads a stored chapter and its book name from the database into the page fields.</summary>
    private void LoadChapterFromDatabase(jiang_Db_Sql newdb, string bookid, string sectionid)
    {
        string sql = "select a.[bookName],b.[sectionTitle],b.[sectionContent],b.[readUrl],b.[provPage],b.[nextPage] from [book] as a,[section] as b where a.[bookId]=b.[bookId] and b.[sectionId]=" + sectionid;
        newdb.Open();
        try
        {
            SqlDataReader reader = newdb.Re_dr(sql);
            try
            {
                if (reader.Read())
                {
                    title = reader[0].ToString();    // book name
                    zhangjie = reader[1].ToString(); // chapter title
                    content = reader[2].ToString();  // chapter body
                    readurl = reader[3].ToString();  // table-of-contents URL
                    provpage = reader[4].ToString(); // previous-page URL
                    nextpage = reader[5].ToString(); // next-page URL
                    bookurl = "/book/" + bookid + ".html"; // book introduction page
                }
            }
            finally
            {
                reader.Close();
            }
        }
        finally
        {
            newdb.Close();
        }
    }

    /// <summary>Runs an existence query, always closing the connection afterwards.</summary>
    private static bool RunExistsQuery(jiang_Db_Sql newdb, string sql)
    {
        newdb.Open();
        try
        {
            return newdb.Exec_Sql(sql);
        }
        finally
        {
            newdb.Close();
        }
    }

    /// <summary>Runs an insert statement, always closing the connection afterwards.</summary>
    private static void RunNonQuery(jiang_Db_Sql newdb, string sql)
    {
        newdb.Open();
        try
        {
            newdb.ExecSql(sql);
        }
        finally
        {
            newdb.Close();
        }
    }

    /// <summary>True when <paramref name="value"/> consists only of ASCII digits.</summary>
    private static bool IsDigits(string value)
    {
        return value != null && Regex.IsMatch(value, @"^[0-9]+\z");
    }

    /// <summary>
    /// Escapes a value for use inside a single-quoted SQL string literal by doubling
    /// apostrophes; null becomes an empty string (what string.Format produced before).
    /// </summary>
    private static string SqlEscape(string value)
    {
        return value == null ? "" : value.Replace("'", "''");
    }
}


更多相关文章

  1. PHP限制HTML内容中图片必须是本站的方法
  2. PHP生成圆心图片-常用作头像圆图等场景
  3. php随机生成验证图片
  4. IIS7+PHP上传图片成功但却不能访问401.3
  5. php使用gd库将文字转换成图片
  6. php image函数,操作压缩图片时,png图片压缩后整个图片变黑
  7. PHP 使用 OSS 批量上传图片
  8. PHP实现图片上添加文字(证书生成)
  9. php图片上传并重命名图片的功能实现原理

随机推荐

  1. Android学习笔记:Android消息处理机制之Ha
  2. Android版本更新
  3. [Android5.1]开机动画显示工作流程分析
  4. 创建Android第一个工程
  5. 疯狂Android讲义下载
  6. Android快速开源框架--afinal
  7. android ------ AAPT2 error: check logs
  8. Android应用程序与SurfaceFlinger服务的
  9. Android 5将在第二季度发布
  10. Android TTS 实战一:认识 TTS