记一次Python爬虫实战,豆瓣电影Top250爬虫
16lz
2021-01-22
1 import requests 2 from bs4 import BeautifulSoup 3 import re 4 import traceback 5
def GetHtmlText(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    The request is attempted up to two times. A failed attempt (connection
    error, timeout, or non-2xx status) prints its traceback and is retried.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or ``None`` if every attempt failed.
    """
    # Try twice. The original ``range(0, 1)`` only ever made one attempt,
    # contradicting its own "try twice" comment.
    for _ in range(2):
        try:
            # A timeout keeps a stalled connection from hanging the script.
            r = requests.get(url, timeout=10)
            r.encoding = 'utf-8'
            r.raise_for_status()
            return r.text
        except requests.RequestException:
            traceback.print_exc()
    return None
17
def _parse_movie(single):
    """Extract rank, title, cast line, rating and quote from one ``<li>`` entry."""
    num = single.find(name='em').string  # movie rank
    titles = single.find_all(name='span', class_='title')
    other = single.find(name='span', class_='other').string
    if len(titles) == 2:
        movieTitle = titles[0].string + titles[1].string + other
    else:
        movieTitle = titles[0].string + other

    # ``.contents`` lists every child node, including the whitespace text
    # nodes between tags -- presumably why seven nodes come back and the
    # wanted tags sit at the odd indices 1, 3 and 5.
    nodes = single.find(name='div', class_='bd').contents
    movieActor = nodes[1].text
    movieRating = re.findall(r'\d?\.\d?', str(nodes[3]))[0]
    # An entry with fewer nodes has nothing at index 5 (presumably no quote
    # paragraph); fall back to an empty quote instead of raising IndexError.
    movieQuote = nodes[5].text if len(nodes) > 5 else ''

    return {
        'num': num,
        'movieTitle': movieTitle,
        'actor': movieActor,
        'rating': movieRating,
        'quote': movieQuote,
    }


def GetMovieInfo(url):
    """Walk the ten 25-entry pages under ``url`` and record every movie.

    Each page is requested as ``url + '?start=<offset>'`` with offsets
    0, 25, ..., 225. Every ``<li>`` inside the ``<ol class="grid_view">``
    element is parsed and handed to ``printMovieInfo``.

    Errors are reported with a traceback and do not stop the run: a page
    that cannot be fetched or parsed is skipped, and a single entry that
    cannot be parsed is skipped without discarding the rest of its page.

    Args:
        url: Base address of the ranking list, without a query string.
    """
    for page in range(10):
        try:
            html = GetHtmlText(url + '?start=' + str(page * 25))
            if html is None:
                # GetHtmlText returns None when the download failed.
                continue
            soup = BeautifulSoup(html, 'html.parser')
            movie = soup.find(name="ol", class_='grid_view')  # all movies on the page
            movieList = movie.find_all(name='li')  # one <li> per movie
        except Exception:
            traceback.print_exc()
            continue

        for single in movieList:
            # Isolate failures per entry so one malformed item does not
            # drop every movie that follows it on the same page.
            try:
                printMovieInfo(_parse_movie(single))
            except Exception:
                traceback.print_exc()
48
def printMovieInfo(Info, path='/home/why/py/movieInfo.txt'):
    """Append one movie record to a UTF-8 text file.

    The record is written as the rank immediately followed by the title,
    then the cast line, the rating and the quote, each on its own line.

    Args:
        Info: Mapping with the string values ``'num'``, ``'movieTitle'``,
            ``'actor'``, ``'rating'`` and ``'quote'``.
        path: File to append to. Defaults to the location the script
            originally hard-coded.

    Any failure (missing key, non-string value, I/O error) is reported with
    a traceback and otherwise ignored, so one bad record does not stop the
    caller.
    """
    try:
        # Build the record before opening the file so a bad ``Info`` does
        # not create or touch the output file.
        record = ''.join([
            Info['num'], Info['movieTitle'], '\n',
            Info['actor'],
            '\n评分:', Info['rating'],
            '\n评价:', Info['quote'], '\n',
        ])
        with open(path, 'a', encoding='utf-8') as f:
            f.write(record)
    except Exception:
        traceback.print_exc()
56
def main():
    """Run the scraper against the Douban Top 250 list URL."""
    base_url = 'https://movie.douban.com/top250'
    GetMovieInfo(base_url)


# Only start the scrape when executed as a script, so importing this module
# does not trigger network requests and file writes.
if __name__ == '__main__':
    main()
更多相关文章
- 运用Python语言编写获取Linux基本系统信息(三):Python与数据库编
- 使用python实现一个简单的学生信息管理系统
- Python脚本如何获取当前环节和用户等信息
- 使用/proc/meminfo文件查看内存状态信息
- 通过指令“ps -l”查看进程信息
- 为什么函数在ELF中的其他共享库的长度信息?
- 如何从PHP的mail()失败中获取额外的错误信息?
- Linux下CPU显示信息解释
- linux查看硬件信息及驱动设备相关整理