采集资源的方法(文字,图片) [图片]
16lz
2021-01-22
由于网络的开放性,我们浏览网站时,服务器都会把页面数据发送到本地,这就为采集创造了条件。
之前研究采集有一段时间了,在刘建的帮助下,终于可以把别人网站的内容采集到自己的网站上显示出来。但是这样有一个很大的弊端:如果被采集的网站关闭了,你的网站也会因为采集不到内容而无法显示。解决这个问题最好的办法,是把采集到的数据存放到本地,这样就算对方的网站挂了,也不会影响自己的网站。经过和刘建的讨论,总结了如下采集流程,今天把它实现了。
根据流程图,代码如下:
using System;
using System.Collections.Generic;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Data.SqlClient;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

/// <summary>
/// Code-behind for the chapter page. On first request for a chapter the page
/// scrapes it from the remote site, mirrors any attachment images to
/// <c>/pic/section/</c>, and stores the book/section rows locally; on later
/// requests it reads the chapter straight from the local database.
/// </summary>
public partial class chapter : System.Web.UI.Page
{
    /// <summary>Base URL of the site the chapters are scraped from.</summary>
    private const string RemoteSiteUrl = "http://www.lovepd.com/";

    /// <summary>Host the attachment images are downloaded from.</summary>
    private const string RemoteImageHost = "http://2.yxmimi.com/";

    /// <summary>Matches attachment image paths embedded in chapter content.</summary>
    private const string AttachmentPattern = @"/files/article/attachment/(\d+)_(\d+)_(\d+)_(\d+)\.(\w+)";

    /// <summary>
    /// Extensions savePic is willing to write under the web root. The extension
    /// comes from remote content, so anything executable (e.g. .aspx) must be refused.
    /// </summary>
    private static readonly string[] AllowedImageExtensions = { ".jpg", ".jpeg", ".gif", ".png", ".bmp" };

    // Values consumed by the page markup.
    protected string title, content, newContent, bookurl, readurl, provpage, nextpage, zhangjie, keywords, description;

    /// <summary>
    /// Loads the requested chapter (query-string keys <c>bookid</c>, <c>chapterid</c>
    /// and, when scraping, <c>cutid</c>) and fills the title/keywords/description fields.
    /// </summary>
    /// <exception cref="ArgumentException">
    /// <c>bookid</c> or <c>chapterid</c> is missing or not made of decimal digits,
    /// or <c>cutid</c> is present but not made of decimal digits.
    /// </exception>
    protected void Page_Load(object sender, EventArgs e)
    {
        string bookid = Request.QueryString["bookid"];
        string sectionid = Request.QueryString["chapterid"];
        jiangs_Tools.check_str(bookid);
        jiangs_Tools.check_str(sectionid);

        // Both ids are concatenated unquoted into SQL below, so they must be plain digits.
        RequireNumeric(bookid, "bookid");
        RequireNumeric(sectionid, "chapterid");

        jiang_Db_Sql newdb = new jiang_Db_Sql();
        bool hasbook = RowExists(newdb, "select count(*) from [book] where [bookid]=" + bookid);          // does the book exist?
        bool hassecction = RowExists(newdb, "select count(*) from [section] where [sectionid]=" + sectionid); // does the chapter exist?

        if (!hasbook || !hassecction)
        {
            // Something is missing locally: scrape the chapter from the remote site.
            LoadChapterFromRemote(bookid, sectionid);

            Response.Flush();      // flush output first to reduce the user's wait
            newContent = content;  // copy that gets its image paths rewritten for storage

            if (!hasbook)
            {
                // Add the book row.
                Execute(newdb, string.Format(
                    "insert into [book]([bookId],[bookName]) values('{0}','{1}')",
                    bookid, SqlQuote(title)));
            }

            // Collect the attachment image paths found in the content and mirror them.
            string imgContent = jiangs_Rex.GetRegValue(AttachmentPattern, content, "|", "");
            if (!string.IsNullOrEmpty(imgContent))
            {
                savePic(imgContent);
            }

            if (!hassecction)
            {
                // Add the chapter row. Guarded so an existing section is never inserted twice
                // when only the book row was missing.
                Execute(newdb, string.Format(
                    "insert into [section]([sectionId],[bookId],[sectionTitle],[sectionContent],[readUrl],[provPage],[nextPage]) values('{0}','{1}','{2}','{3}','{4}','{5}','{6}')",
                    sectionid, bookid, SqlQuote(zhangjie), SqlQuote(newContent),
                    SqlQuote(readurl), SqlQuote(provpage), SqlQuote(nextpage)));
            }
        }
        else
        {
            // Book and chapter both exist: read straight from the database.
            LoadChapterFromDatabase(newdb, bookid, sectionid);
        }

        this.Page.Title = title + ">> " + zhangjie + " - 天下小说网";
        keywords = "\"" + title + "最新章节列表," + title + "全文阅读," + title + "TXT电子书下载," + title + "JAR电子书下载," + title + "UMD电子书下载\"";
        description = "\"天下小说网为小说爱好者提供" + title + "最近更新章节阅读," + title + "全文在线阅读," + title + "最新章节电子书下载(包括" + title + "的TXT格式下载、" + title + "的JAR格式下载、" + title + "的UMD格式下载)\"";
    }

    /// <summary>
    /// Downloads each attachment image listed in <paramref name="imgcontent"/> into
    /// <c>/pic/section/</c> and then rewrites the attachment paths in
    /// <c>newContent</c> to point at the local copies.
    /// </summary>
    /// <param name="imgcontent">
    /// '|'-separated attachment paths (a leading '|' is tolerated). Null or empty does nothing.
    /// </param>
    /// <remarks>
    /// Downloading is best-effort: a failed image is logged and skipped, never thrown.
    /// </remarks>
    public void savePic(string imgcontent)
    {
        if (string.IsNullOrEmpty(imgcontent))
        {
            return;
        }

        // RemoveEmptyEntries drops the leading separator without assuming it is there.
        string[] paths = imgcontent.Split(new[] { '|' }, StringSplitOptions.RemoveEmptyEntries);

        using (WebClient client = new WebClient())
        {
            foreach (string path in paths) // one download per image
            {
                string imgName = path.Substring(path.LastIndexOf('/') + 1); // local file name
                if (!IsAllowedImageName(imgName))
                {
                    System.Diagnostics.Trace.TraceWarning("savePic: skipped non-image attachment '{0}'", path);
                    continue;
                }

                // The remote host stores 1_2_3_4.ext as 1/2/3/4.ext.
                string picurl = RemoteImageHost + Regex.Replace(path, AttachmentPattern, "files/article/attachment/$1/$2/$3/$4.$5");

                try
                {
                    byte[] data = client.DownloadData(picurl);
                    File.WriteAllBytes(Server.MapPath("/pic/section/" + imgName), data);
                }
                catch (Exception ex)
                {
                    // Best-effort mirror: keep going, but leave a trace instead of swallowing silently.
                    System.Diagnostics.Trace.TraceWarning("savePic: failed to mirror '{0}': {1}", picurl, ex.Message);
                }
            }
        }

        if (newContent != null)
        {
            newContent = Regex.Replace(newContent, AttachmentPattern, "/pic/section/$1_$2_$3_$4.$5");
        }
    }

    /// <summary>Scrapes the chapter page from the remote site and fills the display fields.</summary>
    private void LoadChapterFromRemote(string bookid, string sectionid)
    {
        string cutid = Request.QueryString["cutid"] ?? string.Empty;
        if (cutid.Length > 0)
        {
            // cutid ends up in the remote URL and in rewritten links; keep it to digits.
            RequireNumeric(cutid, "cutid");
        }

        string html = jiangs_Rex.GetRemoteHtmlCode(
            RemoteSiteUrl + "chapter.php?cutid=" + cutid + "&bookid=" + bookid + "&chapterid=" + sectionid);
        html = html.Replace("http://www.lovepd.com/", "");
        html = html.Replace("http://lovepd.com/", "");
        html = jiangs_Rex.ReplaceListUrl(html, @"(chapter)\.php\?cutid=(\d+)&bookid=(\d+)&chapterid=(\d+)", 4);
        html = jiangs_Rex.ReplaceListUrl(html, @"(read)\.php\?cutid=(\d+)\&bookid=(\d+)", 3); // table-of-contents links
        content = jiangs_Rex.GetRegValue(@"(?<=<p align=""left"">)(.*?)(?=</p>)", html, 1);   // chapter body
        html = jiangs_Rex.ReplaceListUrl(html, @"readend\.php\?bookid=(\d+)", 1, "/read/" + cutid); // table-of-contents links
        content = jiangs_Rex.ReplaceContentImgUrl(content); // convert image URLs for image-based novels
        content = content.Replace("第九文学www.d9123.com", "");

        title = jiangs_Rex.GetRegValue(@"(\w+)最新章节列表</b>", html, 1);             // book name
        bookurl = "/book/" + bookid + ".html";                                           // book intro URL
        readurl = jiangs_Rex.GetRegValue(@"main_page = ""([^""]+)""", html, 1);          // table-of-contents URL
        provpage = jiangs_Rex.GetRegValue(@"back_page = ""([^""]+)""", html, 1);         // previous page URL
        nextpage = jiangs_Rex.GetRegValue(@"next_page = ""([^""]+)""", html, 1);         // next page URL
        zhangjie = jiangs_Rex.GetRegValue(@"(?<=<h1>)([^>]+)(?=</h1>)", html, 1);        // chapter title
    }

    /// <summary>Fills the display fields from the locally stored book/section rows.</summary>
    private void LoadChapterFromDatabase(jiang_Db_Sql db, string bookid, string sectionid)
    {
        string sql = "select a.[bookName],b.[sectionTitle],b.[sectionContent],b.[readUrl],b.[provPage],b.[nextPage] from [book] as a,[section] as b where a.[bookId]=b.[bookId] and b.[sectionId]=" + sectionid;
        db.Open();
        try
        {
            using (SqlDataReader reader = db.Re_dr(sql))
            {
                if (reader.Read())
                {
                    title = reader[0].ToString();    // book name
                    zhangjie = reader[1].ToString(); // chapter title
                    content = reader[2].ToString();  // chapter body
                    readurl = reader[3].ToString();  // table of contents
                    provpage = reader[4].ToString(); // previous page
                    nextpage = reader[5].ToString(); // next page
                    bookurl = "/book/" + bookid + ".html"; // book intro page
                }
            }
        }
        finally
        {
            db.Close();
        }
    }

    /// <summary>
    /// Runs a count query through <c>Exec_Sql</c> and returns its bool result, which
    /// Page_Load treats as "the row exists". The connection is closed even on failure.
    /// </summary>
    private static bool RowExists(jiang_Db_Sql db, string sql)
    {
        db.Open();
        try
        {
            return db.Exec_Sql(sql);
        }
        finally
        {
            db.Close();
        }
    }

    /// <summary>Runs a statement through <c>ExecSql</c>, closing the connection even on failure.</summary>
    private static void Execute(jiang_Db_Sql db, string sql)
    {
        db.Open();
        try
        {
            db.ExecSql(sql);
        }
        finally
        {
            db.Close();
        }
    }

    /// <summary>Throws unless <paramref name="value"/> consists only of ASCII decimal digits.</summary>
    private static void RequireNumeric(string value, string name)
    {
        if (string.IsNullOrEmpty(value) || !Regex.IsMatch(value, @"\A[0-9]+\z"))
        {
            throw new ArgumentException("Query-string parameter '" + name + "' must consist of decimal digits.", name);
        }
    }

    /// <summary>
    /// Escapes a value for use inside a single-quoted SQL string literal by doubling
    /// single quotes; null becomes the empty string.
    /// </summary>
    private static string SqlQuote(string value)
    {
        return value == null ? string.Empty : value.Replace("'", "''");
    }

    /// <summary>True when the name is a plain file name with an allowed image extension.</summary>
    private static bool IsAllowedImageName(string fileName)
    {
        if (string.IsNullOrEmpty(fileName) || fileName.IndexOfAny(Path.GetInvalidFileNameChars()) >= 0)
        {
            return false;
        }

        string extension = Path.GetExtension(fileName);
        foreach (string allowed in AllowedImageExtensions)
        {
            if (string.Equals(extension, allowed, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }

        return false;
    }
}
更多相关文章
- PHP限制HTML内容中图片必须是本站的方法
- PHP生成圆心图片-常用作头像圆图等场景
- php随机生成验证图片
- IIS7+PHP上传图片成功但却不能访问401.3
- php使用gd库将文字转换成图片
- php image函数,操作压缩图片时,png图片压缩后整个图片变黑
- PHP 使用 OSS 批量上传图片
- PHP实现图片上添加文字(证书生成)
- php图片上传并重命名图片的功能实现原理