import requests
import trafilatura


def url_read(url):
    try:
        # Fetch with requests so we control the timeout; more reliable than
        # letting trafilatura download the page itself.
        response = requests.get(
            url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}
        )
        response.raise_for_status()
        html = response.text
        # Extract the main body text from the HTML.
        text = trafilatura.extract(html, favor_precision=True)
        return text if text else "Could not extract page content"
    except Exception as e:
        print(f"Fetch failed: {url}, error: {e}")
        return ""


def searxng_search(key_word, page=1):
    # Query a self-hosted SearXNG instance.
    base_url = "http://10.144.144.2:8888/search"
    params = {
        "q": key_word,       # search keywords
        "format": "json",    # return JSON
        "pageno": page,
        "engines": "360search",
    }
    docs = []
    try:
        # Send the GET request (the original called requests.post while the
        # comment said GET; SearXNG's /search endpoint accepts GET).
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()  # raise on HTTP error status codes
        # Parse the JSON response and collect the web-page results.
        data = response.json()
        results = data.get("results", [])
        for result in results:
            url = result.get("url", "")
            docs.append(url)
            print(f"Title: {result.get('title', '')}")
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
    except ValueError as e:
        print(f"Failed to parse JSON: {e}")
    return docs


def search(key_word, max_num=5, only_tap=1):
    # Prefix every query with the game name "托拉姆物语" (Toram Online).
    key_word = "托拉姆物语" + key_word
    docs = []
    url_count = 0
    now_page = 1
    while url_count < max_num:
        urls = searxng_search(key_word, now_page)
        if not urls:
            # No more results; stop instead of looping forever.
            break
        now_page += 1
        for url in urls:
            if only_tap == 1:
                # Keep only TapTap links.
                if ("taptap" in url) and (url not in docs):
                    docs.append(url)
                    url_count += 1
            else:
                if url not in docs:
                    docs.append(url)
                    url_count += 1
            if url_count >= max_num:
                break
    return docs


def read_all_urls(urls):
    ans = ""
    for url in urls:
        data = url_read(url)
        ans += f"URL: {url}, extracted content:\n" + data
        ans += "\n"
    return ans


if __name__ == "__main__":
    # search() already prepends "托拉姆物语", so pass only the extra keywords
    # (the original call duplicated the prefix).
    urls = search("压血魔导", 20, 1)
    print(urls)
    if urls:
        data = url_read(urls[0])
        print(data)
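
# Usage sketch: search() and read_all_urls() compose into a simple
# fetch-and-concatenate pipeline. This assumes the SearXNG instance at
# 10.144.144.2:8888 is reachable; uncomment to run (it performs live HTTP
# requests against the search backend and every result URL).
#
# urls = search("压血魔导", max_num=5, only_tap=1)
# corpus = read_all_urls(urls)
# print(corpus[:2000])  # preview the first 2000 characters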