- import os
- import asyncio
- import re
- import shutil
- from flask import Flask, request, jsonify
- from crawl4ai import AsyncWebCrawler
- from crawl4ai.async_configs import BrowserConfig
- import sys
- import io
- from search import url_read
# Force UTF-8 on stdout/stderr so the Chinese log output below cannot crash
# on consoles whose default encoding is not UTF-8.
# reconfigure() (Python 3.7+) changes the existing stream in place instead of
# stacking a second TextIOWrapper over sys.stdout.buffer, which breaks when
# stdout has already been replaced (e.g. under some WSGI hosts).
sys.stdout.reconfigure(encoding='utf-8')
sys.stderr.reconfigure(encoding='utf-8')

# Playwright scratch directory, kept under the working directory so it can be
# wiped between crawls (see cleanup_temp_dir).
TEMP_DIR = os.path.join(os.getcwd(), "playwright_temp")
os.environ['PLAYWRIGHT_TEMP_DIR'] = TEMP_DIR
# Make sure the temp directory exists before the first crawl.
os.makedirs(TEMP_DIR, exist_ok=True)

app = Flask(__name__)
# NOTE(review): hard-coded credential — should be loaded from an env var.
API_KEY = "gxx12138_test"
# All DuckDuckGo crawls go through a local SOCKS5 proxy.
browser_config = BrowserConfig(proxy="socks5://127.0.0.1:20170")
- # 工具函数:清理临时目录
def cleanup_temp_dir():
    """Delete and recreate the Playwright temp directory.

    Returns:
        bool: True when the directory was wiped and recreated, False when
        a filesystem error occurred (the error is logged, not raised).
    """
    try:
        if os.path.exists(TEMP_DIR):
            shutil.rmtree(TEMP_DIR)
        # exist_ok guards against a concurrent request recreating the
        # directory between rmtree and makedirs.
        os.makedirs(TEMP_DIR, exist_ok=True)
        return True
    except OSError as e:  # rmtree/makedirs raise OSError subclasses
        print(f"清理临时目录失败: {e}")
        return False
- # 网页内容读取函数,用来爬网站文本内容
async def cr4_add_search(key_word):
    """Crawl a DuckDuckGo search for Toram ("托拉姆") plus *key_word* and
    return the markdown links found on the results page.

    Returns:
        dict: {'data': [{'url': ..., 'title': ...}, ...]} with titles
        truncated to 18 characters plus an ellipsis.
    """
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            page = await crawler.arun(
                url=f"https://duckduckgo.com/?q=托拉姆+{key_word}",
            )
            markdown_text = str(page.markdown)
            # Markdown links: [title](http...url)
            link_pattern = re.compile(r'\[([^]]+)\]\((https?://[^)]+)\)')
            collected = []
            for title, url in link_pattern.findall(markdown_text):
                lowered_url = str(url).lower()
                label = str(title)
                # Drop taptap/duckduckgo internal links and bare-URL titles.
                unwanted = (
                    'taptap' in lowered_url
                    or 'duck' in lowered_url
                    or label.startswith("http")
                )
                if unwanted:
                    continue
                shown = label if len(label) <= 18 else label[:18] + '...'
                collected.append({'url': url, 'title': shown})
            return {'data': collected}
        finally:
            # Always reset the Playwright scratch dir, even on crawl failure.
            cleanup_temp_dir()
async def cr4_search(ask, max_results):
    """Search the TapTap Toram board (app 42890) for *ask* and return up to
    *max_results* distinct moment URLs, newline-joined.

    Args:
        ask: search phrase inserted into the TapTap topic-search URL.
        max_results: maximum number of URLs to return.

    Returns:
        str: newline-separated moment URLs, in first-seen page order.
    """
    async with AsyncWebCrawler() as crawler:
        try:
            result = await crawler.arun(
                url=f"https://www.taptap.cn/app/42890/topic/search/{ask}",
            )
            text = str(result.markdown)
            # Dots escaped: an unescaped '.' would also match e.g.
            # 'wwwXtaptapYcn'.
            moment_urls = re.findall(
                r'(https?://www\.taptap\.cn/moment/\d+)', text
            )
            # dict.fromkeys dedupes while preserving first-seen order;
            # list(set(...)) would slice an arbitrary subset below.
            unique_urls = list(dict.fromkeys(moment_urls))
            return "\n".join(unique_urls[:max_results])
        finally:
            # Always reset the Playwright scratch dir, even on crawl failure.
            cleanup_temp_dir()
async def search_in_duck(key_word):
    """Crawl a plain DuckDuckGo web search for *key_word* and return the
    markdown links found on the results page.

    Returns:
        dict: {'data': [{'url': ..., 'title': ...}, ...]} with titles
        truncated to 18 characters plus an ellipsis.
    """
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            page = await crawler.arun(
                url=f"https://duckduckgo.com/?t=ffab&q={key_word}&ia=web",
            )
            markdown_text = str(page.markdown)
            # Markdown links: [title](http...url)
            link_pattern = re.compile(r'\[([^]]+)\]\((https?://[^)]+)\)')
            collected = []
            for title, url in link_pattern.findall(markdown_text):
                lowered_url = str(url).lower()
                label = str(title)
                # Drop duckduckgo internal links and bare-URL titles.
                if 'duck' in lowered_url or label.startswith("http"):
                    continue
                shown = label if len(label) <= 18 else label[:18] + '...'
                collected.append({'url': url, 'title': shown})
            return {'data': collected}
        finally:
            # Always reset the Playwright scratch dir, even on crawl failure.
            cleanup_temp_dir()
@app.route('/search', methods=['POST'])
def search():
    """POST /search — TapTap board search, or DuckDuckGo when "add" is set.

    JSON body: {"word": str, "num": int = 10, "add": bool = False}.
    Requires the Authorization header to equal API_KEY.
    """
    try:
        key = request.headers.get('Authorization')
        if key != API_KEY:
            return jsonify({'error': 'Invalid API key'}), 403
        # get_json(silent=True) returns None instead of raising on a missing
        # or malformed JSON body; falling back to {} keeps .get() safe and
        # lets the intended 400 below fire instead of a generic 500.
        data = request.get_json(silent=True) or {}
        word = data.get('word')
        num = data.get('num', 10)
        add = data.get('add', False)
        if not word:
            return jsonify({'error': 'Missing "word" parameter'}), 400
        if add:
            result = asyncio.run(cr4_add_search(word))
        else:
            result = asyncio.run(cr4_search(word, num))
        print("------------search_result-------------")
        print(result)
        # Dict results are already {'data': ...}; wrap plain strings.
        if isinstance(result, dict):
            return jsonify(result)
        return jsonify({'data': result})
    except Exception as e:
        print(f"处理请求时出错: {e}")
        return jsonify({'error': 'Internal server error'}), 500
@app.route('/duck', methods=['POST'])
def searchInDuck():
    """POST /duck — general DuckDuckGo search.

    JSON body: {"word": str}. Requires the Authorization header to equal
    API_KEY.
    """
    try:
        key = request.headers.get('Authorization')
        if key != API_KEY:
            return jsonify({'error': 'Invalid API key'}), 403
        # get_json(silent=True) returns None instead of raising on a missing
        # or malformed JSON body; falling back to {} keeps .get() safe and
        # lets the intended 400 below fire instead of a generic 500.
        data = request.get_json(silent=True) or {}
        word = data.get('word')
        if not word:
            return jsonify({'error': 'Missing "word" parameter'}), 400
        result = asyncio.run(search_in_duck(word))
        print("------------search_result-------------")
        print(result)
        # Dict results are already {'data': ...}; wrap anything else.
        if isinstance(result, dict):
            return jsonify(result)
        return jsonify({'data': result})
    except Exception as e:
        print(f"处理请求时出错: {e}")
        # NOTE(review): echoing the exception text to the client leaks
        # internals; message kept as-is since clients may parse it.
        return jsonify({'error': f'Internal server error:{e}'}), 500
@app.route('/read_url', methods=['GET'])
def read_url():
    """GET /read_url?url=... — fetch and return page text via url_read.

    NOTE(review): unlike the POST routes this endpoint performs no API-key
    check — confirm that is intentional.
    """
    try:
        url = request.args.get('url')
        # Explicit 400 for a missing parameter, consistent with the other
        # routes; previously url_read(None) fell into the generic 500 path.
        if not url:
            return jsonify({'error': 'Missing "url" parameter'}), 400
        return jsonify({'data': url_read(url)})
    except Exception as e:
        print(f"处理请求时出错: {e}")
        # NOTE(review): echoing the exception text to the client leaks
        # internals; message kept as-is since clients may parse it.
        return jsonify({'error': f'Internal server error:{e}'}), 500
@app.teardown_appcontext
def cleanup(res_or_exc):
    """Wipe the Playwright temp directory when the app context tears down.

    Flask passes the exception that ended the context (or None); it is
    returned unchanged so teardown chaining is unaffected.
    """
    cleanup_temp_dir()
    return res_or_exc
if __name__ == '__main__':
    # Development server, bound on all interfaces at port 6218.
    app.run(host='0.0.0.0', port=6218)