- import os
- import asyncio
- import re
- import shutil
- from flask import Flask, request, jsonify
- from crawl4ai import AsyncWebCrawler
- from crawl4ai.async_configs import BrowserConfig
- import sys
- import io
- from search import url_read
# Force UTF-8 on stdout/stderr so the Chinese log output below cannot crash
# on consoles whose default encoding is not UTF-8.
# reconfigure() (Python 3.7+) changes the existing stream in place instead of
# stacking a second TextIOWrapper over sys.stdout.buffer, which breaks when
# stdout has already been replaced (e.g. under some WSGI hosts).
sys.stdout.reconfigure(encoding='utf-8')
sys.stderr.reconfigure(encoding='utf-8')

# Playwright scratch directory, kept under the working directory so it can be
# wiped between crawls (see cleanup_temp_dir).
TEMP_DIR = os.path.join(os.getcwd(), "playwright_temp")
os.environ['PLAYWRIGHT_TEMP_DIR'] = TEMP_DIR
# Make sure the temp directory exists before the first crawl.
os.makedirs(TEMP_DIR, exist_ok=True)

app = Flask(__name__)
# NOTE(review): hard-coded credential — should be loaded from an env var.
API_KEY = "gxx12138_test"
# All DuckDuckGo crawls go through a local SOCKS5 proxy.
browser_config = BrowserConfig(proxy="socks5://127.0.0.1:20170")
- # 工具函数:清理临时目录
def cleanup_temp_dir():
    """Delete and recreate the Playwright temp directory.

    Returns:
        bool: True when the directory was wiped and recreated, False when
        a filesystem error occurred (the error is logged, not raised).
    """
    try:
        if os.path.exists(TEMP_DIR):
            shutil.rmtree(TEMP_DIR)
        # exist_ok guards against a concurrent request recreating the
        # directory between rmtree and makedirs.
        os.makedirs(TEMP_DIR, exist_ok=True)
        return True
    except OSError as e:  # rmtree/makedirs raise OSError subclasses
        print(f"清理临时目录失败: {e}")
        return False
- # 网页内容读取函数,用来爬网站文本内容
async def cr4_add_search(key_word):
    """Crawl a DuckDuckGo search for Toram ("托拉姆") plus *key_word* and
    return the markdown links found on the results page.

    Returns:
        dict: {'data': [{'url': ..., 'title': ...}, ...]} with titles
        truncated to 18 characters plus an ellipsis.
    """
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            page = await crawler.arun(
                url=f"https://duckduckgo.com/?q=托拉姆+{key_word}",
            )
            markdown_text = str(page.markdown)
            # Markdown links: [title](http...url)
            link_pattern = re.compile(r'\[([^]]+)\]\((https?://[^)]+)\)')
            collected = []
            for title, url in link_pattern.findall(markdown_text):
                lowered_url = str(url).lower()
                label = str(title)
                # Drop taptap/duckduckgo internal links and bare-URL titles.
                unwanted = (
                    'taptap' in lowered_url
                    or 'duck' in lowered_url
                    or label.startswith("http")
                )
                if unwanted:
                    continue
                shown = label if len(label) <= 18 else label[:18] + '...'
                collected.append({'url': url, 'title': shown})
            return {'data': collected}
        finally:
            # Always reset the Playwright scratch dir, even on crawl failure.
            cleanup_temp_dir()
async def cr4_search(ask, max_results):
    """Search the TapTap Toram board (app 42890) for *ask* and return up to
    *max_results* distinct moment URLs, newline-joined.

    Args:
        ask: search phrase inserted into the TapTap topic-search URL.
        max_results: maximum number of URLs to return.

    Returns:
        str: newline-separated moment URLs, in first-seen page order.
    """
    async with AsyncWebCrawler() as crawler:
        try:
            result = await crawler.arun(
                url=f"https://www.taptap.cn/app/42890/topic/search/{ask}",
            )
            text = str(result.markdown)
            # Dots escaped: an unescaped '.' would also match e.g.
            # 'wwwXtaptapYcn'.
            moment_urls = re.findall(
                r'(https?://www\.taptap\.cn/moment/\d+)', text
            )
            # dict.fromkeys dedupes while preserving first-seen order;
            # list(set(...)) would slice an arbitrary subset below.
            unique_urls = list(dict.fromkeys(moment_urls))
            return "\n".join(unique_urls[:max_results])
        finally:
            # Always reset the Playwright scratch dir, even on crawl failure.
            cleanup_temp_dir()
async def search_in_duck(key_word):
    """Crawl a plain DuckDuckGo web search for *key_word* and return the
    markdown links found on the results page.

    Returns:
        dict: {'data': [{'url': ..., 'title': ...}, ...]} with titles
        truncated to 18 characters plus an ellipsis.
    """
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            page = await crawler.arun(
                url=f"https://duckduckgo.com/?t=ffab&q={key_word}&ia=web",
            )
            markdown_text = str(page.markdown)
            # Markdown links: [title](http...url)
            link_pattern = re.compile(r'\[([^]]+)\]\((https?://[^)]+)\)')
            collected = []
            for title, url in link_pattern.findall(markdown_text):
                lowered_url = str(url).lower()
                label = str(title)
                # Drop duckduckgo internal links and bare-URL titles.
                if 'duck' in lowered_url or label.startswith("http"):
                    continue
                shown = label if len(label) <= 18 else label[:18] + '...'
                collected.append({'url': url, 'title': shown})
            return {'data': collected}
        finally:
            # Always reset the Playwright scratch dir, even on crawl failure.
            cleanup_temp_dir()
@app.route('/search', methods=['POST'])
def search():
    """POST /search — TapTap board search, or DuckDuckGo when "add" is set.

    JSON body: {"word": str, "num": int = 10, "add": bool = False}.
    Requires the Authorization header to equal API_KEY.
    """
    try:
        key = request.headers.get('Authorization')
        if key != API_KEY:
            return jsonify({'error': 'Invalid API key'}), 403
        # get_json(silent=True) returns None instead of raising on a missing
        # or malformed JSON body; falling back to {} keeps .get() safe and
        # lets the intended 400 below fire instead of a generic 500.
        data = request.get_json(silent=True) or {}
        word = data.get('word')
        num = data.get('num', 10)
        add = data.get('add', False)
        if not word:
            return jsonify({'error': 'Missing "word" parameter'}), 400
        if add:
            result = asyncio.run(cr4_add_search(word))
        else:
            result = asyncio.run(cr4_search(word, num))
        print("------------search_result-------------")
        print(result)
        # Dict results are already {'data': ...}; wrap plain strings.
        if isinstance(result, dict):
            return jsonify(result)
        return jsonify({'data': result})
    except Exception as e:
        print(f"处理请求时出错: {e}")
        return jsonify({'error': 'Internal server error'}), 500
@app.route('/duck', methods=['POST'])
def searchInDuck():
    """POST /duck — general DuckDuckGo search.

    JSON body: {"word": str}. Requires the Authorization header to equal
    API_KEY.
    """
    try:
        key = request.headers.get('Authorization')
        if key != API_KEY:
            return jsonify({'error': 'Invalid API key'}), 403
        # get_json(silent=True) returns None instead of raising on a missing
        # or malformed JSON body; falling back to {} keeps .get() safe and
        # lets the intended 400 below fire instead of a generic 500.
        data = request.get_json(silent=True) or {}
        word = data.get('word')
        if not word:
            return jsonify({'error': 'Missing "word" parameter'}), 400
        result = asyncio.run(search_in_duck(word))
        print("------------search_result-------------")
        print(result)
        # Dict results are already {'data': ...}; wrap anything else.
        if isinstance(result, dict):
            return jsonify(result)
        return jsonify({'data': result})
    except Exception as e:
        print(f"处理请求时出错: {e}")
        # NOTE(review): echoing the exception text to the client leaks
        # internals; message kept as-is since clients may parse it.
        return jsonify({'error': f'Internal server error:{e}'}), 500
@app.route('/read_url', methods=['GET'])
def read_url():
    """GET /read_url?url=... — fetch and return page text via url_read.

    NOTE(review): unlike the POST routes this endpoint performs no API-key
    check — confirm that is intentional.
    """
    try:
        url = request.args.get('url')
        # Explicit 400 for a missing parameter, consistent with the other
        # routes; previously url_read(None) fell into the generic 500 path.
        if not url:
            return jsonify({'error': 'Missing "url" parameter'}), 400
        return jsonify({'data': url_read(url)})
    except Exception as e:
        print(f"处理请求时出错: {e}")
        # NOTE(review): echoing the exception text to the client leaks
        # internals; message kept as-is since clients may parse it.
        return jsonify({'error': f'Internal server error:{e}'}), 500
@app.teardown_appcontext
def cleanup(res_or_exc):
    """Wipe the Playwright temp directory when the app context tears down.

    Flask passes the exception that ended the context (or None); it is
    returned unchanged so teardown chaining is unaffected.
    """
    cleanup_temp_dir()
    return res_or_exc
if __name__ == '__main__':
    # Development server, bound on all interfaces at port 6218.
    app.run(host='0.0.0.0', port=6218)