# collect_links 源代码

# encoding: UTF-8
# 环境:py35(gevent)
"""
脚本功能:广度遍历采集链接(友情链接采集工具)

涉及:树的广度遍历(队列),代理模式,多线程,生产者消费者,跨线程数据共享-队列(阻塞)

帮助:python collect_links.py -h

使用示例:python collect_links.py -s 'https://hexo.yuanjh.cn' -suf '/links'

程序执行步骤

1,https://hexo.yuanjh.cn => (1,https://hexo.yuanjh.cn/links)

2,(1,https://hexo.yuanjh.cn/links) => [(2,http://xxx.yy.com/links),(2,https://zz.ff.cn/links)]

3,[(2,http://xxx.yy.com/links),(2,https://zz.ff.cn/links)]=> [(3,http://xxx.yy.zz/links),(3,https://zz.ff.zz/links)]

=> 循环此步骤

"""
import argparse
import re
import threading
from multiprocessing import cpu_count, Pool, Queue
from typing import List, Tuple, Callable
import requests
from contextlib2 import suppress


class UniqueQueue:
    """
    Queue wrapper that accepts each distinct element at most once.

    Items are stored in a ``multiprocessing.Queue``. The key of every item
    accepted so far is remembered in ``unique_set`` and is never removed, so
    a repeated item is rejected even after the first copy has been taken out
    of the queue.

    :ivar key: optional callable applied to an item to derive the key used
        for duplicate detection; when it is unset the item itself is the key
    :ivar queue: underlying ``multiprocessing.Queue``
    :ivar unique_set: keys of all items accepted so far
    """

    def __init__(self, maxsize: int = 0, key: Callable = None) -> None:
        """
        Create an empty queue.

        :param maxsize: forwarded to ``multiprocessing.Queue`` as its
            ``maxsize`` (maximum number of queued elements)
        :param key: optional callable producing the de-duplication key of an
            item; items whose keys compare equal are enqueued only once
        """
        self.key = key
        self.queue = Queue(maxsize=maxsize)
        self.unique_set = set()

    def put(self, item: Tuple[int, str]) -> bool:
        """
        Enqueue ``item`` unless an item with the same key was accepted before.

        :param item: tuple of (depth of the url, url address)
        :return: ``True`` if the item was enqueued, ``False`` if it was
            rejected as a duplicate
        """
        unique_key = self.key(item) if self.key else item
        if unique_key in self.unique_set:
            return False
        # Enqueue first and record the key afterwards, so a failed put does
        # not leave the key marked as seen.
        self.queue.put(item)
        self.unique_set.add(unique_key)
        return True

    def get(self) -> Tuple[int, str]:
        """Remove and return the next item from the underlying queue."""
        return self.queue.get()

    def empty(self) -> bool:
        """Return whether the underlying queue reports itself as empty."""
        return self.queue.empty()
if __name__ == '__main__':
    # Command-line entry point: parse the options, run the collector and
    # print the set of links it reports as successful.
    parser = argparse.ArgumentParser()
    # (flags, keyword options) for each supported command-line argument,
    # registered in this order.
    for _flags, _options in (
        (('-s', '--seed_url'), {'type': str, 'help': 'seed url address'}),
        (('-suf', '--suffix'), {'type': str, 'help': 'suffix'}),
        (('-maxc', '--max_count'),
         {'type': int, 'default': 100, 'help': 'max count of url'}),
        (('-maxd', '--max_depth'),
         {'type': int, 'default': 10, 'help': 'max depth of url'}),
    ):
        parser.add_argument(*_flags, **_options)
    args = parser.parse_args()

    collect_links = CollectLinks(
        seed_url=args.seed_url,
        suffix=args.suffix,
        max_count=args.max_count,
        max_depth=args.max_depth,
    )
    collect_links.collect()
    print(collect_links.success_set)