|
|
@@ -0,0 +1,188 @@
|
|
|
import asyncio
import io
import os
import re
import shutil
import sys
import urllib.parse

from flask import Flask, request, jsonify

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig

from search import url_read
|
|
|
+
|
|
|
# Force UTF-8 on stdout/stderr so CJK log output prints cleanly regardless of
# the host locale.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

# Dedicated temp directory for Playwright; reset after each request by
# cleanup_temp_dir().
TEMP_DIR = os.path.join(os.getcwd(), "playwright_temp")
os.environ['PLAYWRIGHT_TEMP_DIR'] = TEMP_DIR

# Make sure the temp directory exists before the first crawl.
os.makedirs(TEMP_DIR, exist_ok=True)

app = Flask(__name__)

# Shared secret checked against the Authorization header. Overridable via the
# environment so the key does not have to live in source control; the default
# preserves the previous hard-coded value for backward compatibility.
API_KEY = os.environ.get("SEARCH_API_KEY", "gxx12138_test")

# Route DuckDuckGo crawls through a local SOCKS5 proxy.
browser_config = BrowserConfig(proxy="socks5://127.0.0.1:20170")
|
|
|
+# 工具函数:清理临时目录
|
|
|
def cleanup_temp_dir():
    """Reset the Playwright temp directory to a fresh, empty state.

    Removes TEMP_DIR (when it exists) and recreates it. Returns True on
    success, False when the reset failed.
    """
    try:
        if os.path.exists(TEMP_DIR):
            shutil.rmtree(TEMP_DIR)
        os.makedirs(TEMP_DIR)
    except Exception as e:
        # Best-effort cleanup: report the failure and let callers carry on.
        print(f"清理临时目录失败: {e}")
        return False
    return True
|
|
|
+
|
|
|
+# 网页内容读取函数,用来爬网站文本内容
|
|
|
+
|
|
|
async def cr4_add_search(key_word):
    """Search DuckDuckGo for '托拉姆 <key_word>' and collect result links.

    Returns {'data': [{'url': str, 'title': str}, ...]}. TapTap links,
    DuckDuckGo-internal links, and entries whose visible title is itself a
    URL are skipped; titles longer than 18 characters are truncated with an
    ellipsis.
    """
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            # Percent-encode the query: raw interpolation broke the URL for
            # terms containing spaces, '&', '#', etc.
            query = urllib.parse.quote_plus(f"托拉姆 {key_word}")
            result = await crawler.arun(
                url=f"https://duckduckgo.com/?q={query}",
            )

            text = str(result.markdown)
            # crawl4ai renders results as markdown links: [title](http://...)
            pattern = re.compile(r'\[([^]]+)\]\((https?://[^)]+)\)')
            matches = pattern.findall(text)

            search_results = []
            for title, url in matches:
                url_str = str(url).lower()
                title_str = str(title)
                # Drop TapTap hits (handled by cr4_search), DuckDuckGo chrome,
                # and anchors whose text is just a bare URL.
                if 'taptap' in url_str or 'duck' in url_str or title_str.startswith("http"):
                    continue

                truncated_title = title_str[:18] + '...' if len(title_str) > 18 else title_str
                search_results.append({
                    'url': url,
                    'title': truncated_title
                })

            return {'data': search_results}
        finally:
            # Always reset the Playwright temp dir, even when the crawl fails.
            cleanup_temp_dir()
|
|
|
+
|
|
|
async def cr4_search(ask, max_results):
    """Search TapTap Toram (app 42890) topics for *ask*.

    Returns up to *max_results* distinct moment URLs, newline-joined, in
    first-seen order.
    """
    async with AsyncWebCrawler() as crawler:
        try:
            # Encode the path segment so spaces/CJK/reserved chars are safe.
            result = await crawler.arun(
                url=f"https://www.taptap.cn/app/42890/topic/search/{urllib.parse.quote(str(ask))}",
            )

            text = str(result.markdown)
            # Dots escaped: the old pattern's bare '.' matched any character,
            # so lookalike hosts such as 'wwwXtaptapXcn' would slip through.
            moment_urls = re.findall(r'https?://www\.taptap\.cn/moment/\d+', text)
            # Deduplicate while preserving order — list(set(...)) returned the
            # URLs in arbitrary order, making responses nondeterministic.
            unique_urls = list(dict.fromkeys(moment_urls))
            return "\n".join(unique_urls[:max_results])
        finally:
            # Always reset the Playwright temp dir, even when the crawl fails.
            cleanup_temp_dir()
|
|
|
+
|
|
|
async def search_in_duck(key_word):
    """General DuckDuckGo search for *key_word*.

    Returns {'data': [{'url': str, 'title': str}, ...]}, skipping
    DuckDuckGo-internal links and entries whose visible title is itself a
    URL; titles longer than 18 characters are truncated with an ellipsis.
    """
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            # Percent-encode the query: raw interpolation broke the URL for
            # terms containing spaces, '&', '#', etc.
            query = urllib.parse.quote_plus(str(key_word))
            result = await crawler.arun(
                url=f"https://duckduckgo.com/?t=ffab&q={query}&ia=web",
            )

            text = str(result.markdown)
            # crawl4ai renders results as markdown links: [title](http://...)
            pattern = re.compile(r'\[([^]]+)\]\((https?://[^)]+)\)')
            matches = pattern.findall(text)

            search_results = []
            for title, url in matches:
                url_str = str(url).lower()
                title_str = str(title)
                # Drop DuckDuckGo chrome and anchors whose text is a bare URL.
                if 'duck' in url_str or title_str.startswith("http"):
                    continue

                truncated_title = title_str[:18] + '...' if len(title_str) > 18 else title_str
                search_results.append({
                    'url': url,
                    'title': truncated_title
                })

            return {'data': search_results}
        finally:
            # Always reset the Playwright temp dir, even when the crawl fails.
            cleanup_temp_dir()
|
|
|
+
|
|
|
@app.route('/search', methods=['POST'])
def search():
    """POST /search — TapTap topic search, or DuckDuckGo when add=true.

    JSON body: {"word": str, "num": int (default 10), "add": bool (default
    false)}. Requires the shared API key in the Authorization header.
    Responds {'data': ...} on success, {'error': ...} with 400/403/500
    otherwise.
    """
    try:
        key = request.headers.get('Authorization')
        if key != API_KEY:
            return jsonify({'error': 'Invalid API key'}), 403

        # get_json(silent=True) yields None for a missing or malformed JSON
        # body instead of raising — request.json would have turned a client
        # error into a 500.
        data = request.get_json(silent=True) or {}
        word = data.get('word')
        num = data.get('num', 10)
        add = data.get('add', False)

        if not word:
            return jsonify({'error': 'Missing "word" parameter'}), 400

        # 'add' switches to the broader DuckDuckGo search.
        if add:
            result = asyncio.run(cr4_add_search(word))
        else:
            result = asyncio.run(cr4_search(word, num))

        print("------------search_result-------------")
        print(result)

        # Dict results already carry the {'data': ...} envelope.
        if isinstance(result, dict):
            return jsonify(result)
        else:
            return jsonify({'data': result})

    except Exception as e:
        print(f"处理请求时出错: {e}")
        return jsonify({'error': 'Internal server error'}), 500
|
|
|
+
|
|
|
+
|
|
|
@app.route('/duck', methods=['POST'])
def searchInDuck():
    """POST /duck — general DuckDuckGo search.

    JSON body: {"word": str}. Requires the shared API key in the
    Authorization header. Responds {'data': [...]} on success.
    """
    try:
        key = request.headers.get('Authorization')
        if key != API_KEY:
            return jsonify({'error': 'Invalid API key'}), 403

        # get_json(silent=True) yields None for a missing or malformed JSON
        # body instead of raising — request.json would have turned a client
        # error into a 500.
        data = request.get_json(silent=True) or {}
        word = data.get('word')

        if not word:
            return jsonify({'error': 'Missing "word" parameter'}), 400

        result = asyncio.run(search_in_duck(word))

        print("------------search_result-------------")
        print(result)

        if isinstance(result, dict):
            return jsonify(result)
        else:
            return jsonify({'data': result})

    except Exception as e:
        # Log the detail server-side only; the previous response echoed the
        # raw exception to the client (information leak) and was inconsistent
        # with the /search handler's generic message.
        print(f"处理请求时出错: {e}")
        return jsonify({'error': 'Internal server error'}), 500
|
|
|
+
|
|
|
@app.route('/read_url', methods=['GET'])
def read_url():
    """GET /read_url?url=... — fetch and return a page's text via url_read.

    NOTE(review): unlike /search and /duck, this endpoint performs no API-key
    check — confirm whether that is intentional.
    """
    try:
        url = request.args.get('url')
        if not url:
            # Fail fast with a 400 instead of passing None into url_read and
            # surfacing the resulting exception as a 500.
            return jsonify({'error': 'Missing "url" parameter'}), 400
        return jsonify({'data': url_read(url)})
    except Exception as e:
        # Log the detail server-side only; do not echo exceptions to clients.
        print(f"处理请求时出错: {e}")
        return jsonify({'error': 'Internal server error'}), 500
|
|
|
+
|
|
|
@app.teardown_appcontext
def cleanup(res_or_exc):
    # Reset the Playwright temp directory after every request / app-context
    # teardown; the response-or-exception value is passed through unchanged.
    cleanup_temp_dir()
    return res_or_exc
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Development entry point: listen on all interfaces, port 6218.
    app.run(host='0.0.0.0', port=6218)
|