
Scraping Information from the Web with Python

Date: 2018-07-20 14:51:58    Category: python

The open() function of the webbrowser module launches the browser and opens the specified URL:

>>> import webbrowser
>>> webbrowser.open('www.baidu.com')
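
The module also has open_new() and open_new_tab(), which ask for a new browser window or tab (whether that is honored is up to the browser); all of these return True when a browser could be launched:

>>> webbrowser.open_new_tab('http://www.baidu.com')
True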

Get an address from the command line or from the clipboard, then open Baidu Maps in the browser with that address loaded:

import sys,webbrowser,pyperclip
# Launch the browser and open baidu.com
webbrowser.open("www.baidu.com")
if len(sys.argv)>1:
	params=sys.argv[1:] # get the command-line arguments
	address=' '.join(params)    # join the list into a single string with spaces
else:
	address=pyperclip.paste()   # no arguments: read the address from the clipboard
webbrowser.open(r'http://map.baidu.com/?newmap=1&s=s%26wd%3D'+address+'%26c%3D28&from=alamap&tpl=mapcity')
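
pyperclip only needs two functions here: copy() puts text on the clipboard and paste() reads it back, which is what the else branch above relies on. A quick REPL check (the sample text is arbitrary):

>>> import pyperclip
>>> pyperclip.copy('北京市 海淀区')
>>> pyperclip.paste()
'北京市 海淀区'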

Downloading files from the Web with the requests module

import requests
res=requests.get('https://.../xiaoshuo/44/44376/11931132.html')
print(type(res))
print(res.status_code==requests.codes.ok)	# was the request successful?
print(len(res.text))
res.encoding='gbk'	# requests guesses the encoding automatically; if the guess is wrong and the text is garbled, set it manually. The page requested above is gbk-encoded.
print(res.text[:100])
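
If you would rather not hard-code 'gbk', requests can also guess the encoding from the response body itself via res.apparent_encoding (slower than the header-based guess, but usually right for pages whose headers declare no charset). A sketch with a placeholder URL:

import requests

res = requests.get('https://www.baidu.com')     # placeholder URL for illustration
print(res.encoding)             # guess based on the HTTP headers
print(res.apparent_encoding)    # guess based on the response content
res.encoding = res.apparent_encoding            # make res.text use the content-based guess
print(res.text[:100])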

The raise_for_status() method raises an exception if the download failed; if it succeeded, it does nothing.

res=requests.get('https://abc/xiaoshuo/44/44376/11931132a.html')	# this path does not exist
try:
	res.raise_for_status()
except:
	print('next!!!')
print('Without try, the exception would stop the program here')

next!!!
Without try, the exception would stop the program here
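
A bare except also swallows network problems such as timeouts and DNS failures. raise_for_status() raises requests.exceptions.HTTPError, so the two cases can be separated; a sketch, using an example URL that answers with a 404:

import requests

try:
	res = requests.get('https://httpbin.org/status/404')	# example URL that returns 404
	res.raise_for_status()
except requests.exceptions.HTTPError as exc:
	print('HTTP error: %s' % exc)           # non-2xx status code
except requests.exceptions.RequestException as exc:
	print('request failed: %s' % exc)       # DNS failure, timeout, connection error, ...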

Downloading a file and saving it to disk

import requests,os
res=requests.get('https://.../xiaoshuo/44/44376/11931132.html')	# download the HTML page
res.raise_for_status()
file=open(os.path.join('E:\\','path','web.txt'),'wb')	# save it as web.txt
for chunk in res.iter_content(100000):	# write 100000 bytes at a time
	file.write(chunk)
file.close()
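
For bigger files the same loop works with stream=True, so the body is fetched lazily as iter_content() is consumed, and a with block closes the file automatically. A sketch with a placeholder URL and the same target folder as above:

import os, requests

url = 'https://www.python.org/'             # placeholder URL for illustration
res = requests.get(url, stream=True)        # don't load the whole body into memory at once
res.raise_for_status()
with open(os.path.join('E:\\', 'path', 'web.txt'), 'wb') as file:
	for chunk in res.iter_content(100000):
		file.write(chunk)               # the with block closes the file on exit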

Parsing HTML with the BeautifulSoup module

import requests,bs4,os
# Parse a downloaded page
url='https://.../xiaoshuo/44/44376/11931132.html'
res=requests.get(url)
res.raise_for_status()
soup=bs4.BeautifulSoup(res.text,"lxml")	# specify the parser
print(type(soup))

# Parse a local file
file=open(os.path.join('E:\\','path','web.html'))
soup=bs4.BeautifulSoup(file,"lxml")
print(type(soup))
ele=soup.select('.novel h1')
print(len(ele))
title=ele[0].get_text()	# get the text inside the element
print(title)
content=soup.select('.novel .yd_text2')[0].getText()
attrs=soup.select('.novel .yd_text2')[0].attrs	# get the element's attribute dict
print(attrs)
print(content)
parts=soup.select('.novel .pereview a')
print(len(parts))
for i in range(0,len(parts)):
	href=parts[i].get('href')	# value of the element's href attribute
	print(href,end='---')
	target=parts[i].get('target')	# value of the element's target attribute
	print(target,end='---')
	hclass=parts[i].get('class')	# value of the element's class attribute
	print(hclass)
	print(parts[i].getText()+'---'+href+'---'+target+'---'+str(hclass))
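
Besides get(), bs4 tags also allow dictionary-style access (tag['href'] raises KeyError for a missing attribute while get() returns None), and select_one() returns the first match directly instead of a list. A small self-contained sketch with a made-up HTML snippet:

import bs4

html = '<div class="novel"><a href="/1.html" target="_blank">第一章</a></div>'
soup = bs4.BeautifulSoup(html, 'lxml')
link = soup.select_one('.novel a')  # first match, or None if nothing matches
print(link['href'])                 # '/1.html' (raises KeyError if the attribute is missing)
print(link.get('rel'))              # None (get() returns None instead of raising)
print(link.getText())               # '第一章'
print(link.attrs)                   # {'href': '/1.html', 'target': '_blank'}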

Opening all the links on a page

import webbrowser,requests,bs4
url='https://.../xiaoshuo/44/44376/'
res=requests.get(url)
res.raise_for_status()
print(res.encoding) # ISO-8859-1 (the page itself is gbk)
res.encoding='gbk'  # switch to the page's real encoding
soup=bs4.BeautifulSoup(res.text,"lxml")
eles=soup.select('.mulu ul li a')
print(soup.original_encoding)   # None — bs4 received already-decoded Unicode text, so it detected nothing
soup.prettify('gbk')    # prettify() outputs UTF-8 by default; pass the page encoding to output gbk instead
rmax=len(eles)
if rmax>10:	# the novel has many chapters, open only 10 at a time for testing
	rmax=10
for i in range(0,rmax):
	eles[i].encode('gbk')	# each element can also be encoded individually (the encoding was already set above, so this is optional)
	href=eles[i].get('href')
	path=url+href
	print('open  '+eles[i].getText()+'   '+href)	# without setting the encoding above, this output would be garbled
	webbrowser.open(path)	# open the page

# Notes on handling garbled text
# References:
# 1. requests:
# When you make a request, Requests makes educated guesses about the encoding of the response based on the HTTP headers.
# The text encoding guessed by Requests is used when you access r.text. You can find out what encoding Requests is using,
# and change it, using the r.encoding property.
# 2. bs4:
# Beautiful Soup uses a sub-library called Unicode, Dammit to detect a document's encoding and convert it to Unicode.
# The autodetected encoding is available as the .original_encoding attribute of the BeautifulSoup object.
# Unicode, Dammit guesses correctly most of the time, but sometimes it makes mistakes. Sometimes it guesses correctly,
# but only after a byte-by-byte search of the document that takes a very long time. If you happen to know a document's encoding ahead of time,
# you can avoid mistakes and delays by passing it to the BeautifulSoup constructor as from_encoding.
# Passing from_encoding together with already-decoded text triggers:
# UserWarning: You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.

# When Beautiful Soup outputs a document, the output is UTF-8 regardless of the input encoding; to output in a different encoding, pass it to prettify().
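
The warning above appears because res.text has already been decoded to Unicode, so from_encoding has nothing left to do. Passing the raw bytes (res.content) lets BeautifulSoup decode the document itself, and then from_encoding and original_encoding behave as documented. A sketch on the same kind of gbk page as above:

import requests,bs4

res = requests.get('https://.../xiaoshuo/44/44376/')	# same elided gbk page as above
res.raise_for_status()
soup = bs4.BeautifulSoup(res.content, 'lxml', from_encoding='gbk')	# let bs4 decode the raw bytes itself
print(soup.original_encoding)	# should now report the page encoding instead of None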

Downloading comics and saving them to disk

Layout: comic name (new folder)\chapter name (new folder)\page.jpg

# Download comics
import os,bs4,requests,logging
from urllib.parse import urlparse	# URL parsing
from urllib.parse import urljoin	# URL joining
logging.basicConfig(level=logging.DEBUG)	# logging setup

# Download every image on one chapter page
def downloadJpg(view_url,capter):
	logging.debug('Downloading page: %s'%view_url)
	res=requests.get(view_url,timeout=request_time_out)
	res.raise_for_status()
	soup=bs4.BeautifulSoup(res.text,'lxml')
	img_eles=soup.select('#main .comic-imgs img')
	for i in range(0,len(img_eles)):
		img_src=img_eles[i].get('data-kksrc')	# the real jpg URL still needs cleaning up
		src=str(img_src).replace('amp;','')	# strip the 'amp;' left over from HTML-escaped '&amp;'
		logging.debug('Image URL: %s'%src)
		img = requests.get(src,timeout=request_time_out)	# download the image
		img_file = open(os.path.join(capter, str(i+1) + '.jpg'), 'wb')
		logging.debug('Image file: %s' % (os.path.join(capter, str(i+1) + '.jpg')))
		for chunk in img.iter_content(100000):
			img_file.write(chunk)
		img_file.close()

path='http://www.kuaikanmanhua.com/web/topic/1745/'
request_time_out=5.0	# request timeout: 5 seconds
res=requests.get(path,timeout=request_time_out)
res.raise_for_status()
soup=bs4.BeautifulSoup(res.text,'lxml')
capters=soup.select('.article-list .table .tit a')
for i in range(0,len(capters)):
	href=capters[i].get('href')
	title=capters[i].get('title')
	capter=os.path.join('E:\\', 'path', '元尊', title)	# create one folder per chapter
	os.makedirs(capter,exist_ok=True)
	logging.debug('Created folder: %s'%title)
	downloadJpg(urljoin(path,'../../../%s'%href),capter)	# urljoin yields the correct page URL; urljoin(path,'../../../%s'%href) is effectively http://www.kuaikanmanhua.com/ + href
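
How urljoin resolves the '../../../' prefix is easiest to see on its own: each '../' drops one path segment of the base URL, so three of them climb from /web/topic/1745/ back to the site root (the relative href below is made up):

>>> from urllib.parse import urljoin
>>> urljoin('http://www.kuaikanmanhua.com/web/topic/1745/', '../../../web/comic/0000/')
'http://www.kuaikanmanhua.com/web/comic/0000/'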