| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485 |
- import requests
- import trafilatura
def url_read(url):
    """Fetch *url* and return the extracted main text of the page.

    Returns the article text extracted by trafilatura, the fallback
    message string when extraction yields nothing, or "" when the
    request or extraction raises any error (the error is printed).
    """
    try:
        # Fetch with requests so we control the timeout; send a browser-like
        # User-Agent to avoid trivial bot blocking.
        resp = requests.get(
            url,
            timeout=10,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        resp.raise_for_status()
        # Extract the main body text from the HTML.
        extracted = trafilatura.extract(resp.text, favor_precision=True)
        if extracted:
            return extracted
        return "无法提取网页内容"
    except Exception as e:
        # Best-effort boundary: report the failure and return an empty string.
        print(f"抓取失败: {url}, 错误: {str(e)}")
        return ""
def searxng_search(key_word, page=1, base_url="http://10.144.144.2:8888/search"):
    """Query a SearXNG instance and return the result URLs for one page.

    Args:
        key_word: search query string.
        page: 1-based result page number.
        base_url: SearXNG ``/search`` endpoint. New optional parameter;
            defaults to the previously hard-coded instance, so existing
            callers are unaffected.

    Returns:
        list[str]: URLs of the results on the page; empty list on any
        request or JSON-parsing failure (the error is printed).
    """
    params = {
        "q": key_word,        # search query
        "format": "json",     # ask for a JSON response
        "pageno": page,
        "engines": "360search",
    }
    docs = []
    try:
        # BUG FIX: the SearXNG JSON search API is a GET endpoint; the
        # original code called requests.post while its own comment said GET.
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()  # raise on HTTP error status codes
        data = response.json()
        # Collect the URL of every web result; print titles for visibility.
        for result in data.get("results", []):
            docs.append(result.get('url', ''))
            print(f"标题:{result.get('title', '')}")
    except requests.exceptions.RequestException as e:
        print(f"请求失败：{e}")
    except ValueError as e:
        print(f"解析 JSON 失败：{e}")
    return docs
def search(key_word, max_num=5, only_tap=1):
    """Search for "托拉姆物语"-prefixed *key_word* and collect result URLs.

    Args:
        key_word: query suffix; the fixed game-name prefix is prepended.
        max_num: maximum number of unique URLs to collect.
        only_tap: when 1, keep only URLs containing "taptap";
            otherwise keep every URL.

    Returns:
        list[str]: up to *max_num* unique result URLs, in discovery order.
    """
    key_word = "托拉姆物语" + key_word
    docs = []
    now_page = 1
    while len(docs) < max_num:
        urls = searxng_search(key_word, now_page)
        now_page += 1
        # BUG FIX: the original looped forever when a page came back empty
        # (e.g. no more results, or the search request failed) because the
        # collected count could never reach max_num. Stop paging instead.
        if not urls:
            break
        for url in urls:
            # Apply the taptap filter, then dedupe before appending.
            if only_tap == 1 and "taptap" not in url:
                continue
            if url not in docs:
                docs.append(url)
            if len(docs) >= max_num:
                break
    return docs
def read_all_urls(urls):
    """Fetch every URL in *urls* and concatenate the extracted texts.

    Args:
        urls: iterable of page URLs to read via ``url_read``.

    Returns:
        str: one string containing, per URL, a header naming the URL
        followed by the page text and a trailing newline. Empty string
        for an empty input.
    """
    # Collect the pieces and join once — avoids the quadratic cost of
    # repeated ``+=`` string concatenation in the original.
    parts = []
    for url in urls:
        data = url_read(url)
        parts.append(f"网页地址:{url}读取到的内容如下:\n" + data + "\n")
    return "".join(parts)
if __name__ == "__main__":
    # Demo run: search for guide pages and dump the first one's text.
    urls = search("托拉姆物语+压血魔导", 20, 1)
    print(urls)
    # BUG FIX: the original indexed urls[0] unconditionally and raised
    # IndexError whenever the search returned no results.
    if urls:
        data = url_read(urls[0])
        print(data)
|