Note on error codes: 5xx means a server-side error; 4xx means a bad address or an error on the client side.
1. Basic page download
from urllib import request
from urllib.error import URLError
def download(url):
    # build the request and fetch the page as a UTF-8 string
    response = request.Request(url)
    html = request.urlopen(response).read().decode('utf-8')
    return html
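A quick usage check, with a placeholder URL (any reachable page works):
html = download('http://example.webscraping.com')   # placeholder URL
print(html[:100])                                    # first 100 characters of the page source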
2. Retrying failed downloads: the crawler retries a link that cannot be fetched and gives up once the number of errors exceeds the limit
def download(url, num_retried=2):
    print('download:', url)
    try:
        response = request.Request(url)
        html = request.urlopen(response).read().decode('utf-8')
    except URLError as e:
        print('Download Error:', e.reason)
        html = None
        if num_retried > 0:
            # retry only on 5xx server errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retried - 1)
    return html
3. Retry download with a user agent
def download(url, num_retried=2, user_agent='wswp'):
    print('download:', url)
    headers = {'User-agent': user_agent}
    response = request.Request(url, headers=headers)
    try:
        html = request.urlopen(response).read()
    except URLError as e:
        print('Download Error:', e.reason)
        html = None
        if num_retried > 0:
            # retry only on 5xx server errors, keeping the same user agent
            if hasattr(e, 'code') and 500 <= e.code < 600:
                html = download(url, num_retried - 1, user_agent)
    return html
4. Sitemap crawler
import re

def crawl_sitemap(url):
    sitemap = download(url)
    print(type(sitemap))
    # extract the URLs listed in the sitemap's <loc> tags
    pattern = re.compile('<loc>(.*?)</loc>')
    links = pattern.findall(str(sitemap))
    for link in links:
        html = download(link)
5. ID iteration crawler
import itertools

def crawl_ID(url):
    for page in itertools.count(1):
        urls = url + '-%d' % page
        html = download(urls)
        if html is None:
            break
6. Upgraded ID crawler: downloading stops only after the link has failed the maximum number of times in a row (e.g. 5)
def crawl_ID_1(url, max_error):
    num_error = 0
    for page in itertools.count(1):
        urls = url + '-%d' % page
        html = download(urls)
        if html is None:
            num_error += 1
            if num_error == max_error:
                break
        else:
            # a successful download resets the consecutive-error counter
            num_error = 0
7. Link crawler
import re

def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)

def get_links(html):
    # find all href values inside <a> tags
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(str(html))
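A possible call, assuming the pages we want all match a pattern such as /(index|view) (the seed URL and regex are only illustrative); note that the relative links collected this way cannot be downloaded directly, which is exactly what step 8 fixes:
# Illustrative call: the seed URL and link_regex are placeholders, not from the notes above.
link_crawler('http://example.webscraping.com', '/(index|view)')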
8. Upgraded link crawler: relative links are joined with the seed URL to form absolute links
from urllib.parse import urljoin

def link_crawler_1(seed_url, link_regex):
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):
                # turn a relative link into an absolute URL before queueing it
                link = urljoin(seed_url, link)
                crawl_queue.append(link)
def get_links(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(str(html))
9. Parsing robots.txt (run in the interactive shell)
import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()
url = 'http://example.webscraping.com'
user_agent = 'BadCrawler'
rp.can_fetch(user_agent, url)   # returns False if robots.txt bans this user agent
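The same parser can be plugged into the download loop so that disallowed URLs are skipped; a minimal sketch, assuming the 'wswp' user agent from earlier (the URL is a placeholder):
import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()

url = 'http://example.webscraping.com'        # placeholder URL
if rp.can_fetch('wswp', url):                 # only download what robots.txt allows
    html = download(url)
else:
    print('Blocked by robots.txt:', url)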
10. Proxy support
from urllib import request
from urllib.error import URLError
from urllib.parse import urlparse

def download(url, user_agent='wswp', proxy=None, num_retries=2):
    print('download', url)
    headers = {'User-agent': user_agent}
    response = request.Request(url, headers=headers)
    opener = request.build_opener()
    if proxy:
        # route requests for this URL's scheme (http/https) through the proxy
        proxy_params = {urlparse(url).scheme: proxy}
        opener.add_handler(request.ProxyHandler(proxy_params))
    try:
        html = opener.open(response).read()
    except URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                html = download(url, user_agent, proxy, num_retries - 1)
    return html
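A hedged usage sketch; the proxy address below is purely a placeholder for a local HTTP proxy:
# Placeholder proxy address; replace with a real proxy if you have one.
html = download('http://example.webscraping.com', proxy='http://127.0.0.1:8080')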
11. Download throttling
import datetime
import time
from urllib.parse import urlparse

class Throttle:
    def __init__(self, delay):
        self.delay = delay        # minimum seconds between requests to the same domain
        self.domains = {}         # domain -> timestamp of the last access

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()
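Inside a crawl loop the throttle is called before every download; a sketch, where the 2-second delay and the URL list are just examples:
throttle = Throttle(2)                        # example: at least 2 seconds per domain
for url in ['http://example.webscraping.com/places/1',
            'http://example.webscraping.com/places/2']:   # placeholder URLs
    throttle.wait(url)                        # sleeps if this domain was hit too recently
    html = download(url)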
12. Avoiding duplicate downloads
def link_crawler_1(seed_url, link_regex):
    crawl_queue = [seed_url]
    seen = set(crawl_queue)       # URLs that have already been queued
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
13. Avoiding crawler traps (limit the crawl depth)
def link_crawler(*args, max_depth=2):
    seen = {}                     # URL -> depth at which it was discovered
    ...
    depth = seen[url]
    if depth != max_depth:        # stop following links once the depth limit is reached
        for link in links:
            seen[link] = depth + 1
            crawl_queue.append(link)
Summary: combining all of the methods above produces a fairly capable crawler.
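A rough sketch of such a combined crawler, reusing download(), get_links() and Throttle from above; the regex, delay and depth limit are placeholder values:
import re
import urllib.robotparser
from urllib.parse import urljoin

def crawl(seed_url, link_regex, user_agent='wswp', max_depth=2, delay=2):
    # Combines: robots.txt check, throttling, retry download, absolute links,
    # de-duplication and a depth limit. Assumes download(), get_links() and
    # Throttle defined above are available.
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    throttle = Throttle(delay)
    crawl_queue = [seed_url]
    seen = {seed_url: 0}                      # URL -> depth
    while crawl_queue:
        url = crawl_queue.pop()
        if not rp.can_fetch(user_agent, url): # respect robots.txt
            continue
        throttle.wait(url)                    # rate limit per domain
        html = download(url, user_agent=user_agent)
        depth = seen[url]
        if html is None or depth == max_depth:
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urljoin(seed_url, link)
                if link not in seen:          # avoid duplicate downloads
                    seen[link] = depth + 1
                    crawl_queue.append(link)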