加速爬虫

多进程分布式

准备

import multiprocessing as mp  
import time  
from urllib.request import urlopen, urljoin  
from bs4 import BeautifulSoup  
import re  

# Root page of the site being crawled; relative links are resolved against it.
base_url = "https://morvanzhou.github.io/"

# When True, the crawl loops stop once more than 20 pages have been seen.
# False lifts that limit, so the crawl runs until no unseen URL is left.
restricted_crawl = False

def crawl(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to fetch; passed unchanged to ``urlopen``.

    Returns:
        The response body as ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urlopen`` is propagated unchanged.
    """
    # The context manager closes the response (and its underlying connection)
    # even if read() or decode() raises, instead of waiting for garbage
    # collection to do it.
    with urlopen(url) as response:
        # time.sleep(0.1)  # optional artificial latency; unnecessary on a slow connection
        return response.read().decode('utf-8')

def parse(html):
    """Extract the title, the same-site links and the page's own URL.

    Args:
        html: Page markup as a string.

    Returns:
        A ``(title, page_urls, url)`` tuple where

        * ``title`` is the stripped text of the first ``<h1>``, or ``''``
          when the page has none;
        * ``page_urls`` is a set of absolute URLs built by joining every
          ``href`` matching ``^/.+?/$`` onto ``base_url``;
        * ``url`` is the ``content`` of the ``og:url`` meta tag, or ``None``
          when that tag (or its ``content`` attribute) is missing.
    """
    soup = BeautifulSoup(html, 'lxml')

    # Only site-relative links of the form "/.../" are collected.
    links = soup.find_all('a', {'href': re.compile('^/.+?/$')})
    page_urls = {urljoin(base_url, link['href']) for link in links}

    # find() returns None when the tag is absent. Fall back to a neutral value
    # instead of raising, so one unusual page does not abort the whole crawl.
    heading = soup.find('h1')
    title = heading.get_text().strip() if heading is not None else ''

    og_url = soup.find('meta', {'property': 'og:url'})
    url = og_url.get('content') if og_url is not None else None

    return title, page_urls, url

一般方法

# Sequential baseline: pages are fetched and parsed one at a time.
unseen = {base_url}     # discovered but not yet fetched
seen = set()            # already fetched

count, t1 = 1, time.time()

# Keep going while there is something left to fetch and the optional
# 20-page cap (restricted_crawl) has not been exceeded.
while unseen and not (restricted_crawl and len(seen) > 20):
    print('\nDistributed Crawling.....')
    htmls = list(map(crawl, unseen))

    print('\nDistributed Parsing.....')
    results = list(map(parse, htmls))

    print('\nAnalysing.....')
    # Everything fetched in this round is now seen; rebuild the frontier.
    seen |= unseen
    unseen.clear()

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        # Queue only the links that have not been visited yet.
        unseen |= page_urls - seen

print("Total time: %.1fs" % (time.time() - t1, ))

多进程分布式（使用 multiprocessing 进程池）

# Parallel version: fetching and parsing are farmed out to a pool of four
# worker processes, one task per URL / page.
unseen = {base_url}     # discovered but not yet fetched
seen = set()            # already fetched

# NOTE(review): on platforms whose default start method is "spawn" the
# multiprocessing docs require pool creation to sit under
# `if __name__ == "__main__":` when this is run as a script.
# The context manager terminates the worker processes when the crawl ends
# (or fails) instead of leaving them alive until interpreter exit.
with mp.Pool(4) as pool:
    count, t1 = 1, time.time()
    while unseen:                           # still get some url to visit
        if restricted_crawl and len(seen) > 20:
            break
        print('\nDistributed Crawling...')
        crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
        htmls = [j.get() for j in crawl_jobs]       # request connection

        print('\nDistributed Parsing...')
        parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
        results = [j.get() for j in parse_jobs]     # parse html

        print('\nAnalysing...')
        seen.update(unseen)         # seen the crawled
        unseen.clear()              # nothing unseen

        for title, page_urls, url in results:
            print(count, title, url)
            count += 1
            unseen.update(page_urls - seen)     # get new url to crawl
    # Report before leaving the `with` so pool shutdown is not counted.
    print('Total time: %.1f s' % (time.time()-t1, ))    # 16 s !!!

异步爬虫

普通

import time  

def job(t):
    """Simulate a blocking task lasting *t* seconds, printing when it starts and ends."""
    print('Start job', t)
    # time.sleep blocks the calling thread for the whole delay.
    time.sleep(t)
    print('Job', t, 'takes', t, 's')

def main():
    """Run job(1) and then job(2), one after the other."""
    # The calls are made purely for their side effects, so a plain loop is
    # used rather than a comprehension that builds a throwaway list of None.
    for t in range(1, 3):
        job(t)

# Time the blocking version: main() runs its jobs one after another.
t1 = time.time()
main()
print('No async total time: ', time.time() - t1)

异步

import asyncio  

async def job(t):
    """Coroutine that pauses for *t* seconds, printing when it starts and ends."""
    print('Start job ', t)
    # Awaiting asyncio.sleep yields to the event loop, which may run other
    # scheduled jobs during the delay.
    await asyncio.sleep(t)
    print('Job ', t, ' takes ', t, ' s')


async def main(loop):
    """Schedule job(1) and job(2) on *loop* and wait until both have finished."""
    scheduled = []
    for seconds in range(1, 3):
        # create_task only schedules the coroutine; nothing runs until this
        # coroutine yields to the loop at the await below.
        scheduled.append(loop.create_task(job(seconds)))
    # Suspend here until every scheduled task is done.
    await asyncio.wait(scheduled)

# Time the asyncio version of the jobs.
t1 = time.time()
# asyncio.get_event_loop() no longer creates a loop implicitly: with no
# current loop it is deprecated on recent Pythons and raises RuntimeError on
# 3.14+. Fall back to creating and registering a loop explicitly.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
loop.run_until_complete(main(loop))
# loop.close()                          # IPython notebook gives an error if the loop is closed
print("Async total time : ", time.time() - t1)

普通爬取

import requests  
import time  
# Address requested by normal() in this section.
URL = 'https://morvanzhou.github.io/'

def normal():
    """Request URL twice, one request after the other, printing each response's URL."""
    for _ in range(2):
        response = requests.get(URL)
        print(response.url)

# Time the blocking requests-based version.
t1 = time.time()
normal()
print('Normal total time', time.time() - t1)

异步爬取

import aiohttp # 异步加载网页模块  
import asyncio  
import time  
# Address requested by the session-based job() coroutine in this section.
URL = 'https://morvanzhou.github.io/'

async def job(session):
    """Request URL through *session* and return the response's URL as a string.

    Args:
        session: Client session whose ``get`` performs the request (an
            ``aiohttp.ClientSession`` where this file calls it).

    Returns:
        ``str(response.url)`` for the received response.
    """
    # Using the request as an async context manager releases the response and
    # its connection on exit. The body is never read here, so a bare
    # `await session.get(...)` would leave the connection unreleased.
    async with session.get(URL) as response:
        return str(response.url)

async def main(loop):
    """Run two job() requests concurrently over one shared session and print their results."""
    async with aiohttp.ClientSession() as session:
        pending_jobs = []
        for _ in range(2):
            pending_jobs.append(loop.create_task(job(session)))
        # asyncio.wait returns (done, pending) sets; with default arguments it
        # returns only after every task has finished.
        done, _still_pending = await asyncio.wait(pending_jobs)
        all_results = [task.result() for task in done]
        print(all_results)

# Time the aiohttp-based version.
t1 = time.time()
# asyncio.get_event_loop() no longer creates a loop implicitly: with no
# current loop it is deprecated on recent Pythons and raises RuntimeError on
# 3.14+. Fall back to creating and registering a loop explicitly.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
try:
    loop.run_until_complete(main(loop))
finally:
    # Close the loop even when main() fails, so its resources are released.
    loop.close()
print("Async total time:", time.time() - t1)