"""Small Flask service that scrapes search results with crawl4ai.

Endpoints:
    POST /search    TapTap topic search, or DuckDuckGo search when "add" is set.
    POST /duck      Plain DuckDuckGo search.
    GET  /read_url  Return the result of ``url_read`` for a given URL.
"""
import asyncio
import hmac
import io
import os
import re
import shutil
import sys
from urllib.parse import quote, quote_plus

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig
from flask import Flask, request, jsonify

from search import url_read

# Force UTF-8 on the standard streams so the non-ASCII log messages below
# print correctly regardless of the console's default encoding.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

# Point Playwright's temp directory at a folder under the working directory.
TEMP_DIR = os.path.join(os.getcwd(), "playwright_temp")
os.environ['PLAYWRIGHT_TEMP_DIR'] = TEMP_DIR

# Make sure the temp directory exists before any crawler starts.
os.makedirs(TEMP_DIR, exist_ok=True)

app = Flask(__name__)

# NOTE(review): the API key and proxy address are hard-coded; consider loading
# them from configuration instead of source.
API_KEY = "gxx12138_test"
browser_config = BrowserConfig(proxy="socks5://127.0.0.1:20170")

# Matches markdown links of the form [title](http(s)://url).
_MARKDOWN_LINK_RE = re.compile(r'\[([^]]+)\]\((https?://[^)]+)\)')
# Matches TapTap "moment" post URLs (dots escaped so they match literally).
_TAPTAP_MOMENT_RE = re.compile(r'https?://www\.taptap\.cn/moment/\d+')
# Titles longer than this are cut and suffixed with '...'.
_MAX_TITLE_LEN = 18


def cleanup_temp_dir():
    """Delete and recreate the Playwright temp directory.

    Returns:
        True on success, False if the directory could not be reset (the
        failure is printed, not raised, because this runs in cleanup paths).
    """
    try:
        if os.path.exists(TEMP_DIR):
            shutil.rmtree(TEMP_DIR)
        os.makedirs(TEMP_DIR, exist_ok=True)
        return True
    except OSError as e:
        print(f"清理临时目录失败: {e}")
        return False


def _extract_links(markdown, blocked_url_parts):
    """Collect ``{'url', 'title'}`` dicts for markdown links in *markdown*.

    A link is skipped when its lower-cased URL contains any string from
    *blocked_url_parts*, or when its title is itself a URL (starts with
    "http"). Titles longer than 18 characters are truncated with '...'.
    """
    links = []
    for title, url in _MARKDOWN_LINK_RE.findall(markdown):
        lowered_url = url.lower()
        if title.startswith("http"):
            continue
        if any(part in lowered_url for part in blocked_url_parts):
            continue
        if len(title) > _MAX_TITLE_LEN:
            title = title[:_MAX_TITLE_LEN] + '...'
        links.append({'url': url, 'title': title})
    return links


def _is_authorized():
    """Return True when the request's Authorization header equals API_KEY."""
    supplied = request.headers.get('Authorization') or ''
    # Compare as bytes: compare_digest rejects non-ASCII str, and a
    # constant-time comparison avoids leaking the key through timing.
    return hmac.compare_digest(supplied.encode('utf-8'), API_KEY.encode('utf-8'))


async def cr4_add_search(key_word):
    """Search DuckDuckGo for "托拉姆 <key_word>" through the configured proxy.

    Args:
        key_word: Search term; it is URL-encoded before being placed in the
            query string.

    Returns:
        ``{'data': [...]}`` where each item has 'url' and 'title'. Links
        whose URL contains 'taptap' or 'duck' are excluded.
    """
    # Encode the user-supplied term so '&', '#', spaces etc. cannot break
    # or extend the query string.
    query = quote_plus(str(key_word))
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            result = await crawler.arun(
                url=f"https://duckduckgo.com/?q=托拉姆+{query}",
            )
            links = _extract_links(str(result.markdown), ('taptap', 'duck'))
            return {'data': links}
        finally:
            cleanup_temp_dir()


async def cr4_search(ask, max_results):
    """Search the TapTap topic page for app 42890 and return moment URLs.

    Args:
        ask: Search term; URL-encoded as a single path segment.
        max_results: Maximum number of URLs to return (``None`` = no limit).

    Returns:
        A newline-joined string of unique moment URLs, in the order they
        first appear in the crawled page.
    """
    segment = quote(str(ask), safe="")
    async with AsyncWebCrawler() as crawler:
        try:
            result = await crawler.arun(
                url=f"https://www.taptap.cn/app/42890/topic/search/{segment}",
            )
            moment_urls = _TAPTAP_MOMENT_RE.findall(str(result.markdown))
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the truncation below is deterministic (a set is not).
            unique_urls = list(dict.fromkeys(moment_urls))
            return "\n".join(unique_urls[:max_results])
        finally:
            cleanup_temp_dir()


async def search_in_duck(key_word):
    """Search DuckDuckGo for *key_word* through the configured proxy.

    Args:
        key_word: Search term; it is URL-encoded before being placed in the
            query string.

    Returns:
        ``{'data': [...]}`` where each item has 'url' and 'title'. Links
        whose URL contains 'duck' are excluded.
    """
    query = quote_plus(str(key_word))
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            result = await crawler.arun(
                url=f"https://duckduckgo.com/?t=ffab&q={query}&ia=web",
            )
            links = _extract_links(str(result.markdown), ('duck',))
            return {'data': links}
        finally:
            cleanup_temp_dir()


@app.route('/search', methods=['POST'])
def search():
    """Handle POST /search.

    JSON body: ``word`` (required), ``num`` (optional result limit, default
    10), ``add`` (optional; truthy selects the DuckDuckGo search instead of
    the TapTap search). Requires the API key in the Authorization header.
    """
    try:
        if not _is_authorized():
            return jsonify({'error': 'Invalid API key'}), 403

        # silent=True yields None for a missing/invalid JSON body instead of
        # raising, so a bad body is reported as 400 rather than 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'error': 'Invalid JSON body'}), 400

        word = data.get('word')
        num = data.get('num', 10)
        add = data.get('add', False)

        if not word:
            return jsonify({'error': 'Missing "word" parameter'}), 400
        # num is used as a slice bound: None means "no limit"; anything that
        # is not a non-negative int would fail or slice unexpectedly.
        if num is not None and (
            isinstance(num, bool) or not isinstance(num, int) or num < 0
        ):
            return jsonify({'error': '"num" must be a non-negative integer'}), 400

        if add:
            result = asyncio.run(cr4_add_search(word))
        else:
            result = asyncio.run(cr4_search(word, num))

        print("------------search_result-------------")
        print(result)

        if isinstance(result, dict):
            return jsonify(result)
        return jsonify({'data': result})
    except Exception as e:
        print(f"处理请求时出错: {e}")
        return jsonify({'error': 'Internal server error'}), 500


@app.route('/duck', methods=['POST'])
def searchInDuck():
    """Handle POST /duck.

    JSON body: ``word`` (required). Requires the API key in the
    Authorization header. Responds with the DuckDuckGo search results.
    """
    try:
        if not _is_authorized():
            return jsonify({'error': 'Invalid API key'}), 403

        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'error': 'Invalid JSON body'}), 400

        word = data.get('word')
        if not word:
            return jsonify({'error': 'Missing "word" parameter'}), 400

        result = asyncio.run(search_in_duck(word))

        print("------------search_result-------------")
        print(result)

        if isinstance(result, dict):
            return jsonify(result)
        return jsonify({'data': result})
    except Exception as e:
        print(f"处理请求时出错: {e}")
        # NOTE(review): unlike /search, this response includes the exception
        # text; confirm that exposing it to clients is intended.
        return jsonify({'error': f'Internal server error:{e}'}), 500


@app.route('/read_url', methods=['GET'])
def read_url():
    """Handle GET /read_url?url=... and return ``url_read(url)`` as JSON.

    NOTE(review): unlike the POST endpoints this one performs no API-key
    check; confirm that is intended, since it passes a caller-supplied URL
    to ``url_read``.
    """
    try:
        url = request.args.get('url')
        if not url:
            return jsonify({'error': 'Missing "url" parameter'}), 400
        return jsonify({'data': url_read(url)})
    except Exception as e:
        print(f"处理请求时出错: {e}")
        return jsonify({'error': f'Internal server error:{e}'}), 500


@app.teardown_appcontext
def cleanup(res_or_exc):
    """Reset the temp directory when the application context is torn down.

    NOTE(review): this wipes TEMP_DIR after every request; with concurrent
    requests it could remove files a still-running crawl is using.
    """
    cleanup_temp_dir()
    return res_or_exc


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=6218)