1、 概述

本博客纯属原创,如有转载,请注明作者
运行环境:python3.5

所需模块:bs4、queue、threading、pymysql、requests。大家如果想运行此代码,只需要将我标粗的部分(请求头 headers 和数据库连接参数)修改即可。

2、具体内容

2、1导入具体模块

###导入具体模块
import requests
from bs4 import BeautifulSoup
import re
from collections import deque
import sys
import numpy as np
import jieba
import threading
from threading import current_thread,Lock
from time import ctime ,sleep
import pymysql
import json
import urllib
import math
import queue

2、2定义线程类

class MyThread(threading.Thread):
    """Thread that calls ``funcs(*args)`` when it is started.

    Args:
        funcs: Callable executed in the new thread.
        args: Sequence of positional arguments unpacked into ``funcs``.
        name: Optional thread name. When empty, the name generated by
            ``threading.Thread`` ("Thread-N") is kept instead of being
            overwritten with an empty string.
    """

    def __init__(self, funcs, args, name=''):
        # `name or None` lets Thread pick its default name for '' / None.
        super().__init__(name=name or None)
        self.funcs = funcs
        self.args = args

    def run(self):
        """Invoke the stored callable with the stored arguments."""
        self.funcs(*self.args)

2、3接下来就是重点了

###接下来就是爬取网页了
# Matches any HTML tag; used to reduce a prettified <dd> element to its text.
_TAG_RE = re.compile(r"<[^>]*>")

# The rating <span> carries two CSS classes (e.g. "star sa5"); joined they
# become "starsa5". 3-5 stars are labelled 1, 0-2 stars are labelled 0.
_POSITIVE_STARS = frozenset(['starsa5', 'starsa4', 'starsa3'])
_NEGATIVE_STARS = frozenset(['starsa2', 'starsa1', 'starsa0'])

# Browser-like request headers. Adjust these for your own environment.
_REQUEST_HEADERS = {
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-Hans-CN, zh-Hans;q=0.5',
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393'),
    'Connection': 'Keep-Alive',
}

# Placeholders are filled in by the driver, so scraped text is never
# interpolated into the SQL string.
_INSERT_COMMENT_SQL = "INSERT INTO newjd (comment, scores) VALUES (%s, %s)"


def _label_for(star_class):
    """Return 1 for 3-5 stars, 0 for 0-2 stars, None for anything else."""
    if star_class in _POSITIVE_STARS:
        return 1
    if star_class in _NEGATIVE_STARS:
        return 0
    return None


def _parse_comments(html):
    """Extract ``(comment_text, label)`` pairs from one review page."""
    rows = []
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', class_='i-item'):
        body = item.find('dd')
        stars = item.find('span', class_=re.compile(r'^sta'))
        if body is None or stars is None:
            # Incomplete review entry: skip it instead of failing the page.
            continue
        label = _label_for(''.join(stars.get('class')))
        if label is None:
            # Unknown rating class: do not reuse a previous comment's label.
            continue
        rows.append((_TAG_RE.sub('', body.prettify()), label))
    return rows


def _store_comments(rows):
    """Insert the parsed rows into the ``newjd`` table in one transaction."""
    if not rows:
        return
    # Placeholder credentials: replace with your own MySQL settings.
    db = pymysql.connect(host='localhost', user='root',
                         password='**********', database='test',
                         charset='utf8')
    try:
        with db.cursor() as cursor:
            cursor.executemany(_INSERT_COMMENT_SQL, rows)
        db.commit()
    finally:
        db.close()


def getContent(que):
    """Worker loop: crawl review pages from ``que`` until it is empty.

    For every URL the page is downloaded, each comment and its star rating
    are extracted, and the comment text with a 0/1 label is stored in MySQL.
    A failure on one page is reported and the worker moves on to the next.

    Args:
        que: ``collections.deque`` of review-page URLs shared by all workers.
    """
    while True:
        try:
            url = que.popleft()
        except IndexError:
            # Queue drained, possibly by another worker since the last pop.
            return
        print('正在爬的线程是' + current_thread().name + "爬的是" + url)
        try:
            response = requests.get(url, headers=_REQUEST_HEADERS, timeout=10)
            response.encoding = 'gbk'
            rows = _parse_comments(response.text)
            for content, _label in rows:
                print(content)
            _store_comments(rows)
        except Exception as exc:
            # Worker boundary: report which page failed and why, then go on.
            print('运行出错', url, repr(exc))
        else:
            # Pause between pages to avoid hammering the server.
            sleep(3)

2、4运行

###在这里我用了四个线程
def main(product_ids=(549056,), comments_per_page=30, worker_count=4):
    """Queue every review page of the given products and crawl them.

    The comment count of each product is read from the JD summary service,
    converted into a number of review pages, and one URL per page is put on
    a shared queue that ``worker_count`` threads running ``getContent``
    consume.

    Args:
        product_ids: Iterable of JD product (SKU) ids to crawl.
        comments_per_page: Number of comments shown on one review page.
        worker_count: Number of crawler threads to start.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded, so import it explicitly here.
    import urllib.request

    que = deque()
    for product_id in product_ids:
        summary_url = ('http://club.jd.com/ProductPageService.aspx'
                       '?method=GetCommentSummaryBySkuId&referenceId='
                       + str(product_id))
        with urllib.request.urlopen(summary_url) as response:
            summary = json.loads(response.read().decode('utf-8'))

        comment_count = summary.get('CommentCount') or 0
        # Round up so the last, partially filled page is crawled too.
        page_count = math.ceil(comment_count / comments_per_page)
        for page in range(page_count):
            que.append('http://club.jd.com/review/' + str(product_id)
                       + '-0-' + str(page) + '-0.html')

    if not que:
        # Nothing to crawl: do not start any worker threads.
        return

    workers = [
        MyThread(getContent, (que,), name='thread' + str(index))
        for index in range(worker_count)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

# Run the crawler only when the file is executed as a script.
if __name__ == '__main__':
    main()

更多相关文章

  1. 利用读写锁实现sqlite多线程写的问题
  2. python使用MySQLdb模块连接MySQL
  3. 线程往数据库里插数据时偶尔会报错
  4. SQLite3使用总结备忘(多线程/WAL/锁等)
  5. Android模块化开发探索
  6. Android之用Handler实现主线程和子线程互相通信以及子线程和子线
  7. 《Android 创建线程源码与OOM分析》
  8. 重新认识Java线程的概念
  9. java线程实现与进程(二)

随机推荐

  1. android inputmanager中事件的传递流程
  2. android导航设计
  3. Android异步消息框架
  4. android中如何给button加圆角
  5. Android实现打电话功能
  6. Android Activity 常用功能设置(全屏、横
  7. android中apk反编译
  8. Android 图像处理资料
  9. Android 获取控件宽高
  10. android发送短信