关习习 3 days ago
commit
aa7481019a
3 files changed with 276 additions and 0 deletions

+ 188 - 0
main.py

@@ -0,0 +1,188 @@
+import os
+import asyncio
+import re
+import shutil
+from flask import Flask, request, jsonify
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig
+import sys
+import io
+from search import url_read
+
+# Force UTF-8 encoding on stdout/stderr
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
+# Point Playwright at a project-local temp directory via environment variable
+TEMP_DIR = os.path.join(os.getcwd(), "playwright_temp")
+os.environ['PLAYWRIGHT_TEMP_DIR'] = TEMP_DIR
+
+# Ensure the temp directory exists
+os.makedirs(TEMP_DIR, exist_ok=True)
+
+app = Flask(__name__)
+API_KEY = "gxx12138_test"
+
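+# Route crawler traffic through a local SOCKS5 proxy (the port here is deployment-specific)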
+browser_config = BrowserConfig(proxy="socks5://127.0.0.1:20170")
+# Helper: wipe and recreate the temp directory
+def cleanup_temp_dir():
+    try:
+        if os.path.exists(TEMP_DIR):
+            shutil.rmtree(TEMP_DIR)
+            os.makedirs(TEMP_DIR)
+        return True
+    except Exception as e:
+        print(f"清理临时目录失败: {e}")
+        return False
+
+# Game-specific search: prefix the query with the game name and crawl DuckDuckGo results
+
+async def cr4_add_search(key_word):
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        try:
+            result = await crawler.arun(
+                url=f"https://duckduckgo.com/?q=托拉姆+{key_word}",
+            )
+
+            text = str(result.markdown)
+            pattern = re.compile(r'\[([^]]+)\]\((https?://[^)]+)\)')
+            matches = pattern.findall(text)
+
+            search_results = []
+            for title, url in matches:
+                url_str = str(url).lower()
+                title_str = str(title)
+                if 'taptap' in url_str or 'duck' in url_str or title_str.startswith("http"):
+                    continue
+
+                truncated_title = title_str[:18] + '...' if len(title_str) > 18 else title_str
+                search_results.append({
+                    'url': url,
+                    'title': truncated_title
+                })
+
+            return {'data': search_results}
+        finally:
+            cleanup_temp_dir()
+
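+# Search TapTap's in-app topic search and collect unique moment URLs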
+async def cr4_search(ask, max_results):
+    async with AsyncWebCrawler() as crawler:
+        try:
+            result = await crawler.arun(
+                url=f"https://www.taptap.cn/app/42890/topic/search/{ask}",
+            )
+
+            text = str(result.markdown)
+            moment_urls = re.findall(r'(https?://www\.taptap\.cn/moment/\d+)', text)
+            unique_urls = list(set(moment_urls))
+            return "\n".join(unique_urls[:max_results])
+        finally:
+            cleanup_temp_dir()
+
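+# Plain DuckDuckGo search (no game-name prefix); returns title/url pairs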
+async def search_in_duck(key_word):
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        try:
+            result = await crawler.arun(
+                url=f"https://duckduckgo.com/?t=ffab&q={key_word}&ia=web",
+            )
+
+            text = str(result.markdown)
+            pattern = re.compile(r'\[([^]]+)\]\((https?://[^)]+)\)')
+            matches = pattern.findall(text)
+
+            search_results = []
+            for title, url in matches:
+                url_str = str(url).lower()
+                title_str = str(title)
+                if 'duck' in url_str or title_str.startswith("http"):
+                    continue
+
+                truncated_title = title_str[:18] + '...' if len(title_str) > 18 else title_str
+                search_results.append({
+                    'url': url,
+                    'title': truncated_title
+                })
+
+            return {'data': search_results}
+        finally:
+            cleanup_temp_dir()
+
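+# POST /search. JSON body: word (required), num (default 10), add (true switches to the DuckDuckGo-prefixed search)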
+@app.route('/search', methods=['POST'])
+def search():
+    try:
+        key = request.headers.get('Authorization')
+        if key != API_KEY:
+            return jsonify({'error': 'Invalid API key'}), 403
+
+        data = request.json
+        word = data.get('word')
+        num = data.get('num', 10)
+        add = data.get('add', False)
+
+        if not word:
+            return jsonify({'error': 'Missing "word" parameter'}), 400
+
+        if add:
+            result = asyncio.run(cr4_add_search(word))
+        else:
+            result = asyncio.run(cr4_search(word, num))
+
+        print("------------search_result-------------")
+        print(result)
+
+        if isinstance(result, dict):
+            return jsonify(result)
+        else:
+            return jsonify({'data': result})
+
+    except Exception as e:
+        print(f"处理请求时出错: {e}")
+        return jsonify({'error': 'Internal server error'}), 500
+
+
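+# POST /duck. JSON body: word (required); plain DuckDuckGo search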
+@app.route('/duck', methods=['POST'])
+def duck_search():
+    try:
+        key = request.headers.get('Authorization')
+        if key != API_KEY:
+            return jsonify({'error': 'Invalid API key'}), 403
+
+        data = request.json
+        word = data.get('word')
+
+        if not word:
+            return jsonify({'error': 'Missing "word" parameter'}), 400
+
+        result = asyncio.run(search_in_duck(word))
+
+        print("------------search_result-------------")
+        print(result)
+
+        if isinstance(result, dict):
+            return jsonify(result)
+        else:
+            return jsonify({'data': result})
+
+    except Exception as e:
+        print(f"处理请求时出错: {e}")
+        return jsonify({'error': f'Internal server error:{e}'}), 500
+
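+# GET /read_url?url=... returns the extracted main text of the page.
+# Note: unlike the POST routes, this endpoint performs no API-key check.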
+@app.route('/read_url', methods=['GET'])
+def read_url():
+    try:
+        url = request.args.get('url')
+        return jsonify({'data': url_read(url)})
+    except Exception as e:
+        print(f"处理请求时出错: {e}")
+        return jsonify({'error': f'Internal server error:{e}'}), 500
+
+@app.teardown_appcontext
+def cleanup(res_or_exc):
+    cleanup_temp_dir()
+    return res_or_exc
+
+
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=6218)
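
For reference, a minimal client sketch for exercising these routes (hypothetical: it assumes the service is running locally on port 6218 with the API key above; `query_service` is an illustrative helper name, not part of this commit):

    import requests

    BASE = "http://127.0.0.1:6218"                # assumed local deployment
    HEADERS = {"Authorization": "gxx12138_test"}  # must match API_KEY above

    def query_service(word, num=10, add=False):
        # POST /search with the JSON body the route expects
        resp = requests.post(f"{BASE}/search",
                             json={"word": word, "num": num, "add": add},
                             headers=HEADERS, timeout=60)
        resp.raise_for_status()
        return resp.json()

    # /duck and /read_url follow the same pattern:
    # requests.post(f"{BASE}/duck", json={"word": "..."}, headers=HEADERS)
    # requests.get(f"{BASE}/read_url", params={"url": "https://example.com"})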

+ 4 - 0
requirements.txt

@@ -0,0 +1,4 @@
+flask
+crawl4ai
+trafilatura
+requests

+ 85 - 0
search.py

@@ -0,0 +1,85 @@
+import requests
+import trafilatura
+def url_read(url):
+    try:
+        # Fetch with requests so the timeout is under our control; more reliable
+        response = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
+        response.raise_for_status()
+        html = response.text
+
+        # Extract the main article text
+        text = trafilatura.extract(html, favor_precision=True)
+        return text if text else "Unable to extract page content"
+
+    except Exception as e:
+        print(f"抓取失败: {url}, 错误: {str(e)}")
+        return ""
+
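+# Query a local SearXNG instance (JSON format) and return the result URLs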
+def searxng_search(key_word, page=1):
+    # SearXNG endpoint and query parameters
+    base_url = "http://10.144.144.2:8888/search"
+    params = {
+        "q": key_word,  # search keywords
+        "format": "json",  # ask for a JSON response
+        "pageno": page,
+        "engines": "360search"
+    }
+    docs = []
+
+    try:
+        # Send the request; SearXNG accepts POST, and `params` go in the query string
+        response = requests.post(base_url, params=params, timeout=10)
+        response.raise_for_status()  # raise on HTTP error status codes
+
+        # Parse the JSON response
+        data = response.json()
+
+        # Collect the result URLs (printing each title for debugging)
+        results = data.get("results", [])
+        for result in results:
+            url = result.get('url', '')
+            docs.append(url)
+            print(f"Title: {result.get('title', '')}")
+
+    except requests.exceptions.RequestException as e:
+        print(f"Request failed: {e}")
+    except ValueError as e:
+        print(f"JSON parsing failed: {e}")
+
+    return docs
+
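+# Page through SearXNG results until max_num unique URLs are collected;
+# only_tap == 1 keeps taptap.cn links only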
+def search(key_word, max_num=5, only_tap=1):
+    key_word = "托拉姆物语" + key_word
+    docs = []
+    url_count = 0
+    now_page = 1
+    while url_count < max_num:
+        urls = searxng_search(key_word, now_page)
+        if not urls:
+            # No further results; stop rather than paging forever
+            break
+        now_page += 1
+        for url in urls:
+            if url in docs:
+                continue
+            if only_tap == 1 and "taptap" not in url:
+                continue
+            docs.append(url)
+            url_count += 1
+            if url_count >= max_num:
+                break
+    return docs
+
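+# Fetch every URL and concatenate the extracted text into one string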
+def read_all_urls(urls):
+    ans = ""
+    for url in urls:
+        data = url_read(url)
+        ans += f"网页地址:{url}读取到的内容如下:\n" + data
+        ans += "\n"
+    return ans
+
+if __name__ == "__main__":
+    urls = search("托拉姆物语+压血魔导", 20, 1)
+    print(urls)
+    if urls:  # guard against an empty result list
+        print(url_read(urls[0]))
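
read_all_urls is defined above but never exercised by this demo; a sketch of the intended end-to-end flow (assuming the SearXNG instance at 10.144.144.2:8888 is reachable; the query is taken from the demo above):

    # Collect a few TapTap links, then concatenate the extracted page text
    urls = search("压血魔导", max_num=3, only_tap=1)
    context = read_all_urls(urls)
    print(context[:500])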