searx

一、介绍

Searx 是一个免费的互联网元搜索引擎，集成了多个搜索服务（例如 Baidu、Google、Bing），并且可以通过自定义方式轻松添加新的搜索引擎。

二、如何自定义搜索引擎
1、在 engines 目录（searx/engines）添加 py 文件，并编写 request（生成请求参数）和 response（格式化返回结果）方法；请求的发送由服务内置的方法完成。模板如下：

  
# NOTE(review): presumably the search categories this engine is listed under
# (see the 'category' key documented in request()) -- confirm.
categories = ['general']  # optional


def request(query, params):
    """Pre-request callback: fill in the outgoing request for ``query``.

    Args:
        query: the search terms; converted with ``str()`` and
            percent-encoded before being placed in the URL path.
        params (dict): request description, updated in place. Keys listed
            by this template:
              method  : POST/GET
              headers : {}
              data    : {} # if method == POST
              url     : ''
              category: 'search category'
              pageno  : 1 # number of the requested page

    Returns:
        The same ``params`` dict with ``params['url']`` set.
    """
    # Local import keeps this template self-contained.
    from urllib.parse import quote

    # Encode reserved characters (space, '/', '?', '#', ...) so the query
    # cannot change the structure of the URL.
    params['url'] = 'https://host/%s' % quote(str(query), safe='')

    return params


def response(resp):
    """Post-response callback.

    ``resp`` is the requests response object. This template does not read
    it and always returns a single placeholder result with empty fields.
    """
    placeholder = dict(url='', title='', content='')
    return [placeholder]

2、在settings.yml 配置文件添加引擎配置

三、源码分析
1、多线程获取搜索结果

    def search_multiple_requests(self, requests):
        """Run one worker thread per engine request and wait for them.

        ``requests`` yields ``(engine_name, query, request_params)`` tuples.
        Every worker is named with a shared per-call id so it can be found
        again via ``threading.enumerate()``. Workers still alive once the
        remaining time budget is spent are flagged as timed out, reported to
        ``self.result_container`` and logged.
        """
        search_id = str(uuid4())

        for engine_name, query, request_params in requests:
            worker = threading.Thread(
                target=PROCESSORS[engine_name].search,
                args=(
                    query,
                    request_params,
                    self.result_container,
                    self.start_time,
                    self.actual_timeout,
                    engine_name,
                ),
                name=search_id,
            )
            worker._timeout = False
            worker._engine_name = engine_name
            worker.start()

        for worker in threading.enumerate():
            if worker.name != search_id:
                # Not one of the threads started above.
                continue
            # Wait only for what is left of the budget counted from start_time.
            elapsed = time() - self.start_time
            remaining_time = max(0.0, self.actual_timeout - elapsed)
            worker.join(remaining_time)
            if not worker.is_alive():
                continue
            worker._timeout = True
            self.result_container.add_unresponsive_engine(worker._engine_name, 'timeout')
            logger.warning('engine timeout: {0}'.format(worker._engine_name))

2、加载搜索引擎

def load_engine(engine_data):
    """Validate an engine's configured name and load its module.

    Args:
        engine_data (dict): engine configuration; reads ``'name'`` and
            ``'engine'``. ``'name'`` is rewritten in place to lowercase when
            it contains uppercase characters.

    Returns:
        ``None``. NOTE(review): this excerpt stops after the module is
        loaded, so the loaded ``engine`` is never returned or used as
        shown -- confirm against the full implementation.

    Exits the process with status 1 when the name contains an underscore or
    when loading raises one of the exceptions treated as fatal below.
    """
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error('Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        # Logger.warn() is a deprecated alias; warning() is the supported spelling.
        logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        # Load the engine module from its source file.
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
        logger.exception('Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except Exception:
        # Any other error only disables this engine. Catching Exception
        # rather than using a bare except lets BaseException-only signals
        # (e.g. GeneratorExit) propagate instead of being swallowed.
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

3、加载搜索引擎的方法 load_module（实现插件机制时可以借鉴）


def load_module(filename, module_dir):
    """Import ``filename`` from ``module_dir`` as a freshly executed module.

    The module name is ``filename`` without its extension. Any module already
    registered under that name in ``sys.modules`` is dropped first, so every
    call re-executes the source file. The new module is registered in
    ``sys.modules`` before its code runs, and is returned afterwards.
    """
    module_name, _ext = splitext(filename)
    # Forget a previously loaded copy so the file is executed again.
    sys.modules.pop(module_name, None)
    source_path = join(module_dir, filename)
    # Recipe from
    # https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
    module_spec = importlib.util.spec_from_file_location(module_name, source_path)
    loaded = importlib.util.module_from_spec(module_spec)
    sys.modules[module_name] = loaded
    module_spec.loader.exec_module(loaded)
    return loaded

4、值得借鉴的地方
引擎必须继承该类，实现 search方法。调用搜索引擎时用的就是search方法

class EngineProcessor(ABC):
    """Abstract base for engine processors; subclasses must implement ``search``."""

    @abstractmethod
    def search(self, query, params, result_container, start_time, timeout_limit, engine_name=None):
        """Run one engine search.

        ``search_multiple_requests`` in this file starts each worker thread
        with six positional arguments, the last being the engine name, so
        the declared interface accepts ``engine_name`` too. It defaults to
        ``None`` to stay compatible with five-argument callers.

        Args (as passed by ``search_multiple_requests``):
            query: the query for this engine.
            params: the request parameters for this engine.
            result_container: the searcher's shared result container.
            start_time: the searcher's ``start_time``.
            timeout_limit: the searcher's ``actual_timeout``.
            engine_name: name of the engine being queried (optional).
        """

start_new_thread(gc.collect, tuple())：新开一个线程执行垃圾回收，防止内存泄漏。

    def search_standard(self):
        """Fetch the pending engine requests and run them on worker threads.

        Stores the timeout returned by ``self._get_requests()`` in
        ``self.actual_timeout``. When there is at least one request, hands
        them to ``self.search_multiple_requests`` and then starts a separate
        thread running ``gc.collect``. Always returns True.
        """
        pending, timeout = self._get_requests()
        self.actual_timeout = timeout
        print(f"zsq 多线程 查询start {time()}")
        # Dispatch every search request, if there are any.
        if pending:
            self.search_multiple_requests(pending)
            # Run garbage collection on a new thread.
            start_new_thread(gc.collect, ())
        print(f"多线程 查询end {time()}")
        # Results, suggestions, answers and infoboxes are not returned here.
        return True

searx

友情链接更多精彩内容