
Python Asynchronous Requests

Date: 2018-08-26 23:34:47    Category: python

1. Downloading the comic normally, with blocking requests.

import requests, bs4, time
import asyncio, aiohttp

def get_image_urls(url):
    # Fetch the comic page and yield each image URL stored in the data-kksrc attribute.
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    imgs = soup.select('div.list img')
    for img in imgs:
        yield img.get('data-kksrc')


def download_image(image_url, filename):
    # Blocking download: fetch one image with requests and write it to disk before moving on.
    res = requests.get(image_url)
    res.raise_for_status()
    image = res.content
    with open(r'e:\path\mh\{}.jpg'.format(filename), 'wb') as fp:
        fp.write(image)


if __name__ == '__main__':
    site_url = 'https://www.kuaikanmanhua.com/web/comic/105372/'
    t_url = 'https://www.kuaikanmanhua.com/web/comic/15559/'
    image_urls = [image_url for image_url in get_image_urls(site_url)]
    t_list = [image_url for image_url in get_image_urls(t_url)]
    image_urls.extend(t_list)
    start = time.time()
    for filename, url in enumerate(image_urls, start=1):
        download_image(url, filename)
    print('总计:{:.2f}s'.format(time.time()-start))

2. Basic usage of the asyncio package

import asyncio
import time

async def asynct():
    print('asyncio...')
    await asyncio.sleep(1)


def main():
    start = time.time()
    loop = asyncio.get_event_loop()
    tasks = [asynct() for i in range(10)]
    tasks_wait = asyncio.wait(tasks)
    loop.run_until_complete(tasks_wait)
    loop.close()
    print('total: {:.2f}s'.format(time.time()-start))


if __name__ == '__main__':
    main()

Output:

asyncio...
asyncio...
asyncio...
asyncio...
asyncio...
asyncio...
asyncio...
asyncio...
asyncio...
asyncio...
total: 1.01s

asyncio.sleep(1) does not block execution; it suspends the current coroutine and yields control back to the event loop so the other coroutines can run, which is why the total is about 1s rather than 10s.
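For contrast, here is a minimal sketch (not in the original post) where the coroutine calls the blocking time.sleep(1) instead: time.sleep never yields to the event loop, so the ten coroutines run one after another and the total climbs to roughly 10s.

import asyncio
import time

async def blocking_sleep():
    print('blocking...')
    time.sleep(1)  # blocks the whole event-loop thread; no other coroutine can run meanwhile


def main():
    start = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait([blocking_sleep() for i in range(10)]))
    loop.close()
    print('total: {:.2f}s'.format(time.time() - start))  # roughly 10s


if __name__ == '__main__':
    main()

On Python 3.7+ the explicit loop management can also be replaced by an async def main() that awaits asyncio.gather(*tasks) and is handed to asyncio.run(); the timing is the same.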


3. Downloading the comic asynchronously

import asyncio, aiohttp
import requests, bs4
import time
import tqdm
import concurrent.futures as futures

async def download_images(url, filename):
    # Note: a brand-new ClientSession (with its own connection pool) is opened for every image.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            image = await res.content.read()
            with open(r'e:\path\mh\{}.jpg'.format(filename), 'wb') as fp:
                fp.write(image)


def test():
    site_url = 'https://www.kuaikanmanhua.com/web/comic/105372/'
    t_url = 'https://www.kuaikanmanhua.com/web/comic/15559/'
    image_urls = [image_url for image_url in get_image_urls(site_url)]
    t_list = [image_url for image_url in get_image_urls(t_url)]
    image_urls.extend(t_list)
    start = time.time()
    loop = asyncio.get_event_loop()
    tasks = [download_images(url, filename) for filename, url in enumerate(image_urls, start=1)]
    coro = asyncio.wait(tasks)
    loop.run_until_complete(coro)
    loop.close()
    print('总计:{:.2f}s'.format(time.time() - start))


if __name__ == '__main__':
    test()

4. Asynchronous requests via threads

def download_image(image_url, filename):
    res = requests.get(image_url)
    res.raise_for_status()
    image = res.content
    with open(r'e:\path\mh\{}.jpg'.format(filename), 'wb') as fp:
        fp.write(image)


def test1():
    site_url = 'https://www.kuaikanmanhua.com/web/comic/105372/'
    t_url = 'https://www.kuaikanmanhua.com/web/comic/15559/'
    image_urls = [image_url for image_url in get_image_urls(site_url)]
    t_list = [image_url for image_url in get_image_urls(t_url)]
    image_urls.extend(t_list)
    start = time.time()
    loop = asyncio.get_event_loop()
    for filename, url in enumerate(image_urls, start=1):
        # run_in_executor hands the blocking download to a thread pool and returns a
        # Future immediately; nothing below waits for those futures to complete.
        loop.run_in_executor(None, download_image, url, filename)
    print('总计:{:.2f}s'.format(time.time() - start))


if __name__ == '__main__':
    test1()

This approach looks by far the fastest: 244 comic images in just 0.11s, while the approaches above all take roughly 10s. Note, though, that test1() never waits for the futures returned by run_in_executor, so the 0.11s most likely measures only how long it takes to hand the downloads off to the thread pool; the images actually finish downloading in the background afterwards. What is genuinely odd is that the asynchronous download in section 3 takes over 9s, barely faster than the normal download; a likely culprit is that download_images opens a fresh aiohttp.ClientSession, and with it a new connection pool, for every single image. A variant that shares one session is sketched below.
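A minimal sketch of that variant, assuming the same output path and the image_urls list built in test(): one ClientSession is created up front and reused for every request, and asyncio.gather is awaited so the timing covers the downloads themselves, not just their scheduling.

async def download_image_shared(session, url, filename):
    # Every request reuses the same session and its connection pool.
    async with session.get(url) as res:
        image = await res.read()
    with open(r'e:\path\mh\{}.jpg'.format(filename), 'wb') as fp:
        fp.write(image)


async def download_all(image_urls):
    async with aiohttp.ClientSession() as session:
        tasks = [download_image_shared(session, url, filename)
                 for filename, url in enumerate(image_urls, start=1)]
        await asyncio.gather(*tasks)


def test_shared(image_urls):
    start = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(download_all(image_urls))
    loop.close()
    print('Total: {:.2f}s'.format(time.time() - start))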

5. Adding a progress bar to the async download, to make progress easier to watch

async def download_images(url, filename):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            image = await res.content.read()
            with open(r'e:\path\mh\{}.jpg'.format(filename), 'wb') as fp:
                fp.write(image)


async def download_images_tqdm(image_urls):
    tasks = [download_images(url, filename) for filename, url in enumerate(image_urls, start=1)]
    # as_completed yields the futures in completion order, so tqdm advances as each download finishes.
    fs = asyncio.as_completed(tasks)
    tqdm_iter = tqdm.tqdm(fs, total=len(image_urls))
    for future in tqdm_iter:
        await future


def test2():
    site_url = 'https://www.kuaikanmanhua.com/web/comic/105372/'
    t_url = 'https://www.kuaikanmanhua.com/web/comic/15559/'
    image_urls = [image_url for image_url in get_image_urls(site_url)]
    t_list = [image_url for image_url in get_image_urls(t_url)]
    image_urls.extend(t_list)
    start = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(download_images_tqdm(image_urls))
    loop.close()
    print('总计:{:.2f}s'.format(time.time() - start))


if __name__ == '__main__':
    test2()

6. Thread-based requests with concurrent.futures

def download_image2(item):
    # executor.map passes each (filename, url) tuple from enumerate as a single argument.
    filename, image_url = item
    res = requests.get(image_url)
    res.raise_for_status()
    image = res.content
    with open(r'e:\path\mh\{}.jpg'.format(filename), 'wb') as fp:
        fp.write(image)


def test3():
    site_url = 'https://www.kuaikanmanhua.com/web/comic/105372/'
    t_url = 'https://www.kuaikanmanhua.com/web/comic/15559/'
    image_urls = [image_url for image_url in get_image_urls(site_url)]
    t_list = [image_url for image_url in get_image_urls(t_url)]
    image_urls.extend(t_list)
    start = time.time()
    with futures.ThreadPoolExecutor(max_workers=16) as executor:
        executor.map(download_image2, enumerate(image_urls, start=1))
    print('总计:{:.2f}s'.format(time.time() - start))


if __name__ == '__main__':
    test3()

With this approach, different values of max_workers change the result by only about 1s either way; the total here was 3.95s, faster than the asynchronous (aiohttp) version. A small timing sweep over pool sizes is sketched below.
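A rough sketch for checking that claim, reusing download_image2 and the image_urls list from test3(): time the same batch with a few different pool sizes. Network variance easily dominates differences of a second, so the numbers are only indicative.

def compare_worker_counts(image_urls):
    for workers in (4, 8, 16, 32):
        start = time.time()
        with futures.ThreadPoolExecutor(max_workers=workers) as executor:
            # list() consumes the map iterator, which also re-raises any exceptions from the workers.
            list(executor.map(download_image2, enumerate(image_urls, start=1)))
        print('max_workers={}: {:.2f}s'.format(workers, time.time() - start))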


Summary:

Each approach was run twice:

Normal download: 9.44s, 9.87s

Async download (aiohttp): 9.54s, 3.58s

Threaded async download (run_in_executor): 0.11s, 0.23s

Thread pool download (concurrent.futures): 3.87s, 4.32s

The fastest and relatively most stable is the threaded async download, with the caveat from section 4 that its timing reflects task submission rather than completed downloads.