Python爬取小说
16lz
2021-03-30
import requests
import os
import re

# Sample chapter page: https://www.17k.com/chapter/263899/5856183.html


# Get the download URL of every chapter from the table-of-contents page.
def get_toc(html):
    """Extract the absolute URLs of all chapters from the TOC page HTML.

    The chapter list sits between the 'class="tit">正文' marker and the
    'BAIDU_banner_bottom' ad anchor; the last href in that span is not a
    chapter link, so it is dropped.
    """
    print('get url')
    toc_block = re.findall(r'class="tit">正文(.*?)BAIDU_banner_bottom', html, re.S)[0]
    toc_url = re.findall(r'href="(.*?)"', toc_block, re.S)
    start_url = 'https://www.17k.com'
    # Relative hrefs -> absolute URLs; drop the trailing non-chapter link.
    to_url_list = [start_url + url for url in toc_url[:-1]]
    return to_url_list


# Get the chapter title and the novel text from a chapter page.
def get_article(html):
    """Parse one chapter page and persist its title + body via save().

    Raises AttributeError if the expected markers are missing (re.search
    returns None); the caller treats that as a per-chapter failure.
    """
    print('get chapter and text')
    chapter_name = re.search(r'<h1>(.*?)</h1>', html, re.S).group(1)
    text_block = re.search(r'class="p">(.*?)<p class="copy ">', html, re.S).group(1)
    text_content = text_block.replace('<p>', '').replace('</p>', '')
    # Strip the long runs of spaces/tabs the site pads paragraphs with.
    save(chapter_name, re.sub(r'[ \t]', '', text_content))


# Save the novel content to disk, one .txt file per chapter.
def save(chapter, article):
    """Write *article* to <file_path>/<chapter>.txt (UTF-8).

    Characters that are illegal in Windows filenames are replaced with
    '_' so a chapter title like '第1章: 开始?' cannot crash the write.
    """
    file_path = r'C:\Users\coremail\Desktop\爬虫\仙剑四'
    safe_chapter = re.sub(r'[<>:"/\\|?*]', '_', chapter)
    file_name = os.path.join(file_path, safe_chapter + '.txt')
    os.makedirs(file_path, exist_ok=True)
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(article)


def main():
    """Download every chapter of the novel (17k.com list id 263899)."""
    # TOC page of the novel (仙剑四).
    url = 'https://www.17k.com/list/263899.html'
    html_content = requests.get(url).content.decode('UTF-8')
    url_list = get_toc(html_content)
    for novel_url in url_list:
        print(novel_url)
        try:
            get_article(requests.get(novel_url).content.decode('UTF-8'))
        except Exception as e:
            # Best-effort: one bad chapter should not abort the whole run,
            # but report which URL failed instead of a bare error message.
            print(f'failed {novel_url}: {e}')
    print('over')


if __name__ == '__main__':
    main()
更多相关文章
- 「公众号吸粉神级插件」实现网站下载文件需要公众号获取验证码
- 专栏 | 使用zabbix-agent2自定义插件获取https证书过期时间
- web前端怎么获取cookie?新手前端开发者需了解
- 带噪学习研究及其在内容审核业务下的工业级应用
- 深度社区新版论坛正式上线!
- 41款实用工具,数据获取、清洗、建模、可视化都有了
- Solidwork软件(license)许可证竟然还可以这样用!
- 多媒体内容理解在美图社区的应用实践
- linux使用zip修改文件内容