"""search.py — query a local SearXNG instance and extract page text with trafilatura."""
import requests
import trafilatura
  3. def url_read(url):
  4. try:
  5. # 用 requests 控制超时,更稳定
  6. response = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
  7. response.raise_for_status()
  8. html = response.text
  9. # 提取正文
  10. text = trafilatura.extract(html, favor_precision=True)
  11. return text if text else "无法提取网页内容"
  12. except Exception as e:
  13. print(f"抓取失败: {url}, 错误: {str(e)}")
  14. return ""
  15. def searxng_search(key_word, page=1):
  16. # 配置参数
  17. base_url = "http://10.144.144.2:8888/search"
  18. params = {
  19. "q": key_word, # 搜索关键词
  20. "format": "json", # 返回 JSON 格式
  21. "pageno": page,
  22. "engines": "360search"
  23. }
  24. docs = []
  25. try:
  26. # 发送 GET 请求
  27. response = requests.post(base_url, params=params, timeout=10)
  28. response.raise_for_status() # 检查 HTTP 错误状态码
  29. # 解析 JSON 响应
  30. data = response.json()
  31. # 提取搜索结果(以网页结果为例)
  32. results = data.get("results", [])
  33. for idx, result in enumerate(results, 1):
  34. url = result.get('url', '')
  35. docs.append(url)
  36. print(f"标题:{result.get('title', '')}")
  37. except requests.exceptions.RequestException as e:
  38. print(f"请求失败:{e}")
  39. except ValueError as e:
  40. print(f"解析 JSON 失败:{e}")
  41. return docs
  42. def search(key_word, max_num = 5, only_tap = 1):
  43. key_word = "托拉姆物语" + key_word
  44. docs = []
  45. url_count = 0
  46. now_page = 1
  47. while url_count < max_num:
  48. urls = searxng_search(key_word, now_page)
  49. now_page += 1
  50. for url in urls:
  51. if only_tap == 1:
  52. if ("taptap" in url) and (url not in docs):
  53. docs.append(url)
  54. url_count += 1
  55. else:
  56. if url not in docs:
  57. docs.append(url)
  58. url_count += 1
  59. if url_count >= max_num:
  60. break
  61. return docs
  62. def read_all_urls(urls):
  63. ans = ""
  64. for url in urls:
  65. data = url_read(url)
  66. ans += f"网页地址:{url}读取到的内容如下:\n" + data
  67. ans += "\n"
  68. return ans
  69. if __name__ == "__main__":
  70. urls = search("托拉姆物语+压血魔导", 20, 1)
  71. print(urls)
  72. data = url_read(urls[0])
  73. print(data)