main.py

import os
import asyncio
import io
import re
import shutil
import sys

from flask import Flask, request, jsonify
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig

from search import url_read

# Set standard output/error encoding to UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

# Point Playwright's temporary files at a local directory.
TEMP_DIR = os.path.join(os.getcwd(), "playwright_temp")
os.environ['PLAYWRIGHT_TEMP_DIR'] = TEMP_DIR
# Make sure the temporary directory exists.
os.makedirs(TEMP_DIR, exist_ok=True)

app = Flask(__name__)
API_KEY = "gxx12138_test"
# Route browser traffic through a local SOCKS5 proxy.
browser_config = BrowserConfig(proxy="socks5://127.0.0.1:20170")


# Helper: wipe and recreate the temporary directory.
def cleanup_temp_dir():
    try:
        if os.path.exists(TEMP_DIR):
            shutil.rmtree(TEMP_DIR)
        os.makedirs(TEMP_DIR)
        return True
    except Exception as e:
        print(f"Failed to clean temp directory: {e}")
        return False
# Crawl a DuckDuckGo results page for Toram-related queries and extract page links.
async def cr4_add_search(key_word):
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            result = await crawler.arun(
                url=f"https://duckduckgo.com/?q=托拉姆+{key_word}",
            )
            text = str(result.markdown)
            # Rendered markdown links look like [title](https://...); capture both parts.
            pattern = re.compile(r'\[([^]]+)\]\((https?://[^)]+)\)')
            matches = pattern.findall(text)
            search_results = []
            for title, url in matches:
                url_str = str(url).lower()
                title_str = str(title)
                # Skip TapTap links, DuckDuckGo-internal links, and bare-URL titles.
                if 'taptap' in url_str or 'duck' in url_str or title_str.startswith("http"):
                    continue
                truncated_title = title_str[:18] + '...' if len(title_str) > 18 else title_str
                search_results.append({
                    'url': url,
                    'title': truncated_title
                })
            return {'data': search_results}
        finally:
            cleanup_temp_dir()
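
# Standalone usage sketch (assumes crawl4ai/Playwright are installed and the
# SOCKS5 proxy configured above is reachable; "boss" is an illustrative keyword):
#   result = asyncio.run(cr4_add_search("boss"))
#   print(result)  # {'data': [{'url': ..., 'title': ...}, ...]}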
# Search the TapTap forum for app 42890 and return matching moment URLs.
async def cr4_search(ask, max_results):
    async with AsyncWebCrawler() as crawler:
        try:
            result = await crawler.arun(
                url=f"https://www.taptap.cn/app/42890/topic/search/{ask}",
            )
            text = str(result.markdown)
            # Dots are escaped so the pattern matches the literal host name.
            moment_urls = re.findall(r'(https?://www\.taptap\.cn/moment/\d+)', text)
            # Deduplicate (order is not preserved) and cap at max_results.
            unique_urls = list(set(moment_urls))
            return "\n".join(unique_urls[:max_results])
        finally:
            cleanup_temp_dir()
# General DuckDuckGo search without the Toram query prefix.
async def search_in_duck(key_word):
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            result = await crawler.arun(
                url=f"https://duckduckgo.com/?t=ffab&q={key_word}&ia=web",
            )
            text = str(result.markdown)
            pattern = re.compile(r'\[([^]]+)\]\((https?://[^)]+)\)')
            matches = pattern.findall(text)
            search_results = []
            for title, url in matches:
                url_str = str(url).lower()
                title_str = str(title)
                # Skip DuckDuckGo-internal links and bare-URL titles.
                if 'duck' in url_str or title_str.startswith("http"):
                    continue
                truncated_title = title_str[:18] + '...' if len(title_str) > 18 else title_str
                search_results.append({
                    'url': url,
                    'title': truncated_title
                })
            return {'data': search_results}
        finally:
            cleanup_temp_dir()
@app.route('/search', methods=['POST'])
def search():
    try:
        # Simple shared-secret auth via the Authorization header.
        key = request.headers.get('Authorization')
        if key != API_KEY:
            return jsonify({'error': 'Invalid API key'}), 403
        data = request.json
        word = data.get('word')
        num = data.get('num', 10)
        add = data.get('add', False)
        if not word:
            return jsonify({'error': 'Missing "word" parameter'}), 400
        # "add" switches between the DuckDuckGo crawl and the TapTap forum search.
        if add:
            result = asyncio.run(cr4_add_search(word))
        else:
            result = asyncio.run(cr4_search(word, num))
        print("------------search_result-------------")
        print(result)
        if isinstance(result, dict):
            return jsonify(result)
        else:
            return jsonify({'data': result})
    except Exception as e:
        print(f"Error while handling request: {e}")
        return jsonify({'error': 'Internal server error'}), 500
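
# Example request (assumes the server is running locally on the port configured
# below; the query word "boss" is illustrative):
#   curl -X POST http://127.0.0.1:6218/search \
#        -H "Authorization: gxx12138_test" \
#        -H "Content-Type: application/json" \
#        -d '{"word": "boss", "num": 5, "add": false}'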
@app.route('/duck', methods=['POST'])
def searchInDuck():
    try:
        key = request.headers.get('Authorization')
        if key != API_KEY:
            return jsonify({'error': 'Invalid API key'}), 403
        data = request.json
        word = data.get('word')
        if not word:
            return jsonify({'error': 'Missing "word" parameter'}), 400
        result = asyncio.run(search_in_duck(word))
        print("------------search_result-------------")
        print(result)
        if isinstance(result, dict):
            return jsonify(result)
        else:
            return jsonify({'data': result})
    except Exception as e:
        print(f"Error while handling request: {e}")
        return jsonify({'error': f'Internal server error: {e}'}), 500
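
# Example request (same local-server assumption as above):
#   curl -X POST http://127.0.0.1:6218/duck \
#        -H "Authorization: gxx12138_test" \
#        -H "Content-Type: application/json" \
#        -d '{"word": "crawl4ai"}'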
# Note: unlike the POST routes, this endpoint does not check the API key.
@app.route('/read_url', methods=['GET'])
def read_url():
    try:
        url = request.args.get('url')
        if not url:
            return jsonify({'error': 'Missing "url" parameter'}), 400
        return jsonify({'data': url_read(url)})
    except Exception as e:
        print(f"Error while handling request: {e}")
        return jsonify({'error': f'Internal server error: {e}'}), 500
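
# Example request (the target URL is illustrative):
#   curl "http://127.0.0.1:6218/read_url?url=https://example.com"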
# Clean the Playwright temp directory after every request context.
@app.teardown_appcontext
def cleanup(res_or_exc):
    cleanup_temp_dir()
    return res_or_exc


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=6218)
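
# Flask's built-in server is intended for development; a production deployment
# would typically sit behind a WSGI server instead, e.g.:
#   gunicorn -b 0.0.0.0:6218 main:app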