Merge "11"

commit: c60688d6eefe9ea5322a6807a13d1c06a016f151 [log] [tgz]
author: 956303669 <956303669@qq.com> Sat Jun 14 22:03:10 2025 +0800
committer: Gerrit Code Review <root@debian> Sat Jun 14 22:03:10 2025 +0800
tree: dd00fed24e80043c974853ec575157ffb00f808c
parent: cae762d729ecc9fc3f30c26cfde42bd4b06bb5c4 [diff]
parent: a520ffd4934743d11f24891080107b6de9b08633 [diff]
diff --git a/JWLLL/main_online.py b/JWLLL/main_online.py
new file mode 100644
index 0000000..0c2dd7b
--- /dev/null
+++ b/JWLLL/main_online.py

@@ -0,0 +1,1017 @@
+# main_online.py
+# 搜索推荐算法服务的主入口
+
+import json
+import numpy as np
+import difflib
+from flask import Flask, request, jsonify, Response
+import pymysql
+import jieba
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import pypinyin
+from flask_cors import CORS
+import re
+import Levenshtein
+import os
+import logging
+
+# Configure logging: INFO level, with timestamp, logger name, level and message.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger("allpt-search")
+
+# Import the optional Word2Vec helper module. When the import fails the
+# WORD2VEC_ENABLED flag is cleared, a warning is logged, and code guarded by
+# that flag is skipped.
+try:
+    from word2vec_helper import get_word2vec_helper, expand_query, get_similar_words
+    WORD2VEC_ENABLED = True
+    logger.info("Word2Vec模块已加载")
+except ImportError as e:
+    logger.warning(f"Word2Vec模块加载失败: {e}，将使用传统搜索")
+    WORD2VEC_ENABLED = False
+
+# Database configuration.
+# Every value can be overridden through an environment variable so that real
+# credentials do not have to be committed; the literals are only fallbacks
+# that preserve the previous behaviour when nothing is set.
+DB_CONFIG = {
+    "host": os.environ.get("DB_HOST", "10.126.59.25"),
+    "port": int(os.environ.get("DB_PORT", "3306")),
+    "user": os.environ.get("DB_USER", "root"),
+    "password": os.environ.get("DB_PASSWORD", "123456"),
+    "database": os.environ.get("DB_NAME", "redbook"),
+    "charset": os.environ.get("DB_CHARSET", "utf8mb4"),
+}
+
+def get_db_conn():
+    """Open and return a new pymysql connection built from ``DB_CONFIG``."""
+    connection = pymysql.connect(**DB_CONFIG)
+    return connection
+
+def get_pinyin(text):
+    """Return the full pinyin of ``text`` as one concatenated string.
+
+    Empty or falsy input yields ``""``. Input made up only of ASCII letters
+    is returned lower-cased without calling pypinyin. Anything else is
+    converted with ``pypinyin.pinyin(..., style=pypinyin.NORMAL)`` and the
+    first reading of every entry is joined together.
+    """
+    if text:
+        import re
+        is_ascii_word = re.fullmatch(r'[a-zA-Z]+', text) is not None
+        if is_ascii_word:
+            return text.lower()
+        readings = pypinyin.pinyin(text, style=pypinyin.NORMAL)
+        return ''.join(candidates[0] for candidates in readings)
+    return ""
+
+def get_pinyin_initials(text):
+    """Return the pinyin initials of ``text`` as one concatenated string.
+
+    Empty or falsy input yields ``""``. Input made up only of ASCII letters
+    is returned lower-cased in full (not reduced to initials). Anything else
+    is converted with pypinyin and the first character of the first reading
+    of every entry is joined together.
+    """
+    if text:
+        import re
+        is_ascii_word = re.fullmatch(r'[a-zA-Z]+', text) is not None
+        if is_ascii_word:
+            return text.lower()
+        readings = pypinyin.pinyin(text, style=pypinyin.NORMAL)
+        return ''.join(candidates[0][0] for candidates in readings)
+    return ""
+
+# 新增词语相似度计算函数
+def word_similarity(word1, word2):
+    """Score the similarity of two words, with pinyin-aware matching.
+
+    Tiers are checked in order and the first hit wins:
+    identical strings -> 1.0, identical full pinyin -> 0.9, identical pinyin
+    initials -> 0.7, otherwise the ``difflib.SequenceMatcher`` ratio.
+    """
+    if word1 == word2:
+        score = 1.0
+    elif get_pinyin(word1) == get_pinyin(word2):
+        score = 0.9
+    elif get_pinyin_initials(word1) == get_pinyin_initials(word2):
+        score = 0.7
+    else:
+        matcher = difflib.SequenceMatcher(None, word1, word2)
+        score = matcher.ratio()
+    return score
+
+def semantic_title_similarity(query, title):
+    """Score how well ``title`` matches ``query`` at the token level.
+
+    Both strings are tokenised with jieba. Every query token longer than one
+    character (after stripping) is paired with its best ``word_similarity``
+    against the title tokens. The result is
+    ``0.7 * mean(best scores) + 0.3 * strong-match ratio + bonus`` where a
+    strong match is a best score above 0.85, the ratio is taken over *all*
+    query tokens (including the skipped short ones), and the bonus is 0.3
+    when ``query`` occurs verbatim in ``title``. Returns 0.0 when either side
+    has no tokens or no query token qualifies.
+    """
+    query_tokens = list(jieba.cut(query))
+    title_tokens = list(jieba.cut(title))
+    if not (query_tokens and title_tokens):
+        return 0.0
+
+    # Best title-side similarity for each query token worth scoring;
+    # single-character tokens are skipped to reduce noise.
+    best_scores = [
+        max(word_similarity(token, candidate) for candidate in title_tokens)
+        for token in query_tokens
+        if len(token.strip()) > 1
+    ]
+    if not best_scores:
+        return 0.0
+
+    strong_hits = sum(1 for best in best_scores if best > 0.85)
+    mean_score = sum(best_scores) / len(best_scores)
+    hit_ratio = strong_hits / len(query_tokens)
+    phrase_bonus = 0.3 if query in title else 0
+
+    return 0.7 * mean_score + 0.3 * hit_ratio + phrase_bonus
+
+# 添加语义关联词典，用于增强搜索能力
+def load_semantic_mappings():
+    """Load the semantic mapping table from ``semantic_config.json``.
+
+    The file is looked up next to this module. A missing file is logged as a
+    warning and any exception raised while reading or parsing it is logged
+    as an error; in both cases the function still returns (an empty dict
+    unless the JSON had already been parsed).
+
+    Returns:
+        The parsed JSON content of the config file, or ``{}``.
+    """
+    loaded = {}
+
+    try:
+        module_dir = os.path.dirname(__file__)
+        config_path = os.path.join(module_dir, "semantic_config.json")
+        if not os.path.exists(config_path):
+            logger.warning(f"语义配置文件不存在: {config_path}")
+        else:
+            with open(config_path, 'r', encoding='utf-8') as handle:
+                loaded = json.load(handle)
+            logger.info(f"已从配置文件加载 {len(loaded)} 个语义映射")
+    except Exception as e:
+        logger.error(f"加载语义配置文件失败: {e}")
+
+    return loaded
+
+# Initialise the semantic mappings once, when the module is imported.
+SEMANTIC_MAPPINGS = load_semantic_mappings()
+
+def expand_search_keywords(keyword):
+    """Expand a search keyword with semantically related terms.
+
+    The keyword is tokenised with jieba. Every token found in
+    ``SEMANTIC_MAPPINGS`` contributes its mapped terms, and, when Word2Vec is
+    enabled, similar words for the whole keyword and for each token longer
+    than one character are added as well. Word2Vec failures are logged and
+    otherwise ignored so that search keeps working.
+
+    Args:
+        keyword: The raw search keyword.
+
+    Returns:
+        A de-duplicated list of terms in first-seen order: the original
+        keyword first, then configured mappings, then Word2Vec terms.
+    """
+    expanded = [keyword]
+
+    words = list(jieba.cut(keyword))
+    logger.info(f"关键词 '{keyword}' 分词结果: {words}")
+
+    # Configured semantic mappings, looked up per token.
+    for word in words:
+        if word in SEMANTIC_MAPPINGS:
+            mapped_words = SEMANTIC_MAPPINGS[word]
+            # A bare string in the config would otherwise be split into
+            # single characters by extend().
+            if isinstance(mapped_words, str):
+                mapped_words = [mapped_words]
+            expanded.extend(mapped_words)
+            logger.info(f"语义映射: '{word}' -> {mapped_words}")
+
+    if WORD2VEC_ENABLED:
+        try:
+            # Whole keyword first, then each multi-character token.
+            w2v_expanded = list(get_similar_words(keyword, topn=3, min_similarity=0.6))
+            for word in words:
+                if len(word) > 1:  # single characters are too noisy
+                    w2v_expanded.extend(get_similar_words(word, topn=2, min_similarity=0.65))
+            w2v_expanded = list(dict.fromkeys(w2v_expanded))
+
+            expanded.extend(w2v_expanded)
+
+            if w2v_expanded:
+                logger.info(f"Word2Vec扩展: {keyword} -> {w2v_expanded}")
+        except Exception as e:
+            # Best effort: record the failure but do not interrupt the search.
+            logger.error(f"Word2Vec扩展失败: {e}")
+            logger.info("将仅使用配置文件中的语义映射")
+
+    # Order-preserving de-duplication keeps the result deterministic
+    # (a plain set() would shuffle the terms between runs).
+    return list(dict.fromkeys(expanded))
+
+# 替换原有的calculate_keyword_relevance函数，采用更通用的相关性算法
+def calculate_keyword_relevance(keyword, item, expanded_keywords=None):
+    """Compute a relevance score between a search keyword and a result row.
+
+    Args:
+        keyword: The search keyword.
+        item: Mapping describing the entry. Reads ``title``, ``description``
+            (falling back to ``content``), ``tags`` (comma separated) and
+            ``category``; missing or ``None`` values are treated as empty.
+        expanded_keywords: Optional pre-computed result of
+            ``expand_search_keywords(keyword)``. Callers scoring many items
+            for one keyword can pass it to avoid repeating the expansion.
+
+    Returns:
+        15.0 when the title equals the keyword (case-insensitive), otherwise
+        the sum of the partial scores described by the numbered steps below.
+    """
+    title = item.get('title') or ''
+    # Rows built by the search endpoint carry the body text under 'content',
+    # so fall back to it when there is no 'description'.
+    description = item.get('description') or item.get('content') or ''
+    tags = item.get('tags', '') or ''
+    category = item.get('category', '') or ''
+
+    keyword_lower = keyword.lower()
+    title_lower = title.lower()
+    score = 0
+
+    # 1. Exact title match has the highest priority.
+    if keyword_lower == title_lower:
+        return 15.0
+
+    # 2. Keyword is a standalone word of the title / 3. substring of the title.
+    title_words = re.findall(r'\b\w+\b', title_lower)
+    if keyword_lower in title_words:
+        score += 10.0
+    elif keyword_lower in title_lower:
+        # A keyword that covers more than half of the title scores higher.
+        match_ratio = len(keyword) / len(title)
+        score += 8.0 if match_ratio > 0.5 else 5.0
+
+    # 4. Token-level title match (exact token, else same pinyin).
+    keyword_words = list(jieba.cut(keyword))
+    title_jieba_words = list(jieba.cut(title))
+
+    matched_words = 0
+    for k_word in keyword_words:
+        if len(k_word) <= 1:  # ignore single characters
+            continue
+        if k_word in title_jieba_words:
+            matched_words += 1
+            continue
+        k_pinyin = get_pinyin(k_word)
+        if any(get_pinyin(t_word) == k_pinyin for t_word in title_jieba_words):
+            matched_words += 0.8
+
+    if keyword_words:
+        score += 3.0 * (matched_words / len(keyword_words))
+
+    # 5. Pinyin similarity of the whole strings.
+    keyword_pinyin = get_pinyin(keyword)
+    title_pinyin = get_pinyin(title)
+
+    if keyword_pinyin == title_pinyin:
+        score += 3.5
+    elif keyword_pinyin in title_pinyin:
+        # A match at the very start of the title is worth more.
+        score += 3.0 if title_pinyin.find(keyword_pinyin) == 0 else 2.0
+
+    # 6. Edit-distance similarity, falling back to difflib if Levenshtein fails.
+    try:
+        edit_distance = Levenshtein.distance(keyword_lower, title_lower)
+        max_len = max(len(keyword), len(title))
+        similarity = 1 - (edit_distance / max_len) if max_len > 0 else 0.0
+    except Exception:
+        similarity = difflib.SequenceMatcher(None, keyword_lower, title_lower).ratio()
+    if similarity > 0.7:
+        score += 1.5 * similarity
+
+    # 7. Chinese character overlap: only counts when at least two characters
+    #    overlap and either the ratio is >= 40% or three or more overlap.
+    cn_chars_keyword = set(re.findall(r'[\u4e00-\u9fff]', keyword))
+    cn_chars_title = set(re.findall(r'[\u4e00-\u9fff]', title))
+    if cn_chars_keyword and cn_chars_title:
+        overlapped_chars = cn_chars_keyword & cn_chars_title
+
+        if len(overlapped_chars) > 1:
+            overlap_ratio = len(overlapped_chars) / len(cn_chars_keyword)
+            if overlap_ratio >= 0.4 or len(overlapped_chars) >= 3:
+                score += 2.0 * overlap_ratio
+
+        # Debug log kept from the original for one specific case.
+        if keyword == "明日方舟" and "白日梦想家" in title:
+            logger.info(f"'明日方舟'与'{title}'的汉字重叠: {overlapped_chars}, 重叠比例: {len(overlapped_chars)/len(cn_chars_keyword) if cn_chars_keyword else 0}")
+
+    # 8. Series detection: the title is the keyword followed by a number,
+    #    a Chinese numeral or a colon.
+    base_title_match = re.match(r'(.*?)([0-9]+|[一二三四五六七八九十]|：|\:|\s+[0-9]+)', title)
+    if base_title_match:
+        base_title = base_title_match.group(1).strip()
+        if keyword_lower == base_title.lower():
+            score += 2.0
+
+    # 9. Tag, description and category matches.
+    if tags:
+        tags_list = tags.split(',')
+        if keyword in tags_list:
+            score += 1.5
+        elif any(keyword_lower in tag.lower() for tag in tags_list):
+            score += 1.0
+
+    pos = description.lower().find(keyword_lower)
+    if pos >= 0:
+        score += 1.5
+        # An occurrence in the first third of the description weighs more.
+        if pos < len(description) / 3:
+            score += 0.5
+
+    description_words = set(jieba.cut(description))
+    matched_desc_words = sum(
+        1 for k_word in keyword_words
+        if len(k_word) > 1 and k_word in description_words
+    )
+    if keyword_words:
+        score += 1.0 * (matched_desc_words / len(keyword_words))
+
+    if keyword_lower in category.lower():
+        score += 1.0
+
+    # 10. Semantic expansion: related terms that appear in the title.
+    if expanded_keywords is None:
+        expanded_keywords = expand_search_keywords(keyword)
+
+    for exp_keyword in expanded_keywords:
+        if exp_keyword == keyword or exp_keyword not in title:
+            continue
+        if exp_keyword in ["国宝", "熊猫"] and "功夫熊猫" in title:
+            score += 3.0  # strongly related mapping
+        else:
+            score += 1.5  # ordinary semantic relation
+
+    # Extra boost for the hard-coded "panda movie" style queries.
+    if ("国宝" in keyword or "熊猫" in keyword) and "电影" in keyword and "功夫熊猫" in title:
+        score += 4.0
+
+    return score
+
+# Create the Flask application.
+app = Flask(__name__)
+CORS(app)  # Allow all cross-origin requests
+
+# 添加init_word2vec函数
+def init_word2vec():
+    """Make sure the Word2Vec helper has a model loaded and log the outcome.
+
+    If the helper reports itself as initialised its vocabulary size and
+    vector size are logged; otherwise ``load_model()`` is attempted and
+    success or failure is logged. Any exception is logged and swallowed.
+    """
+    try:
+        helper = get_word2vec_helper()
+        if not helper.initialized:
+            if not helper.load_model():
+                logger.error("Word2Vec模型加载失败")
+                return
+            logger.info(f"Word2Vec模型加载成功，词汇量: {len(helper.model.index_to_key)}, 向量维度: {helper.model.vector_size}")
+        else:
+            logger.info(f"Word2Vec模型已成功加载，词汇量: {len(helper.model.index_to_key)}, 向量维度: {helper.model.vector_size}")
+    except Exception as e:
+        logger.error(f"初始化Word2Vec出错: {e}")
+
+# 新的初始化方式:
+def initialize_app():
+    """Application start-up hook (used instead of ``before_first_request``).
+
+    Reloads the global ``SEMANTIC_MAPPINGS`` from the config file and, when
+    the Word2Vec module was imported successfully, initialises its model.
+    """
+    global SEMANTIC_MAPPINGS
+    SEMANTIC_MAPPINGS = load_semantic_mappings()
+
+    if not WORD2VEC_ENABLED:
+        return
+    init_word2vec()
+
+# Call the initialisation function at import time, before the app starts serving.
+initialize_app()
+
+# Search API
+def _search_json_default(value):
+    """Serialise database values that ``json.dumps`` cannot handle natively."""
+    from datetime import date, datetime
+    from decimal import Decimal
+
+    if isinstance(value, (datetime, date)):
+        return value.isoformat()
+    if isinstance(value, Decimal):
+        return float(value)
+    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")
+
+
+@app.route('/search', methods=['POST'])
+def search():
+    """
+    Search API.
+
+    Request body (JSON object):
+        keyword:     required, 1-20 characters after stripping.
+        sort_by:     "downloads" | "downloads_asc" | "newest" | "oldest" |
+                     "similarity" | "title_asc" | "title_desc"
+                     (default "similarity").
+        category:    optional topic id used as a filter.
+        search_mode: "title" | "title_desc" | "tags" | "all" (default "title").
+                     "title_desc" and "all" also match the post content,
+                     "all" additionally matches the topic name. "tags" is
+                     not implemented and behaves like "title".
+        tags:        accepted but currently ignored.
+
+    Returns a JSON object ``{"results": [...]}`` with at most 50 rows.
+    Rows whose title equals the keyword are placed first, the rest is ordered
+    by relevance score; ``sort_by`` then re-orders those rows.
+    Responds 415 for a non-JSON content type, 400 for an invalid body or
+    keyword and 500 when the lookup fails.
+    """
+    # is_json also accepts "application/json; charset=utf-8".
+    if not request.is_json:
+        return jsonify({"error": "Content-Type must be application/json"}), 415
+
+    data = request.get_json()
+    if not isinstance(data, dict):
+        return jsonify({"error": "Request body must be a JSON object"}), 400
+
+    keyword = data.get("keyword", "")
+    if not isinstance(keyword, str):
+        return jsonify({"error": "请输入1-20个字符"}), 400
+    keyword = keyword.strip()
+    sort_by = data.get("sort_by", "similarity")
+    category = data.get("category", None)
+    search_mode = data.get("search_mode", "title")
+
+    # The keyword is required regardless of the search mode.
+    if not (1 <= len(keyword) <= 20):
+        return jsonify({"error": "请输入1-20个字符"}), 400
+
+    # One SELECT shared by every query so that all rows have the same shape.
+    base_select = """
+        SELECT p.id, p.title, p.topic_id, tp.name as category, p.heat, p.created_at, p.content,
+            GROUP_CONCAT(t.name) as tags
+        FROM posts p
+        LEFT JOIN post_tags pt ON p.id = pt.post_id
+        LEFT JOIN tags t ON pt.tag_id = t.id
+        LEFT JOIN topics tp ON p.topic_id = tp.id
+    """
+
+    # Stage 1: fetch candidates from the database and score them.
+    results = []
+    conn = None
+    try:
+        conn = get_db_conn()
+        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
+            # Exact title matches, honouring the category filter.
+            exact_sql = base_select + " WHERE p.title = %s"
+            exact_params = [keyword]
+            if category:
+                exact_sql += " AND p.topic_id = %s"
+                exact_params.append(category)
+            cursor.execute(exact_sql + " GROUP BY p.id", exact_params)
+            exact_matches = list(cursor.fetchall() or [])
+
+            # Expand the keyword with semantically related terms.
+            expanded_keywords = expand_search_keywords(keyword)
+            logger.info(f"扩展后的关键词: {expanded_keywords}")
+            search_terms = [keyword] + [term for term in expanded_keywords if term != keyword]
+
+            # The title is always searched; content and topic name depend on the mode.
+            columns = ["p.title"]
+            if search_mode in ("title_desc", "all"):
+                columns.append("p.content")
+            if search_mode == "all":
+                columns.append("tp.name")
+
+            conditions = []
+            params = []
+            for column in columns:
+                for term in search_terms:
+                    conditions.append(f"{column} LIKE %s")
+                    params.append(f"%{term}%")
+
+            where_clause = " OR ".join(conditions)
+            logger.info(f"搜索条件: {where_clause}")
+            logger.info(f"参数列表: {params}")
+
+            if category:
+                where_clause = f"({where_clause}) AND p.topic_id=%s"
+                params.append(category)
+
+            cursor.execute(f"{base_select} WHERE {where_clause} GROUP BY p.id LIMIT 500", params)
+            candidates = list(cursor.fetchall() or [])
+            logger.info(f"数据库返回记录数: {len(candidates)}")
+
+            # Nothing matched at all: score every post (within the category).
+            if not candidates and not exact_matches:
+                fallback_sql = base_select
+                fallback_params = None
+                if category:
+                    fallback_sql += " WHERE p.topic_id=%s"
+                    fallback_params = [category]
+                cursor.execute(fallback_sql + " GROUP BY p.id", fallback_params)
+                candidates = list(cursor.fetchall() or [])
+
+            # Score the candidates; exact matches are pinned to the top below,
+            # so they are not scored again.
+            seen_ids = {row['id'] for row in exact_matches}
+            scored_results = []
+            for item in candidates:
+                if item['id'] in seen_ids:
+                    continue
+                seen_ids.add(item['id'])
+
+                relevance_score = calculate_keyword_relevance(keyword, item)
+                # Low threshold on purpose so that more results are kept.
+                if relevance_score > 0.1:
+                    item['relevance_score'] = relevance_score
+                    scored_results.append(item)
+                    logger.info(f"匹配项: {item['title']}, 相关性得分: {relevance_score}")
+
+            scored_results.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)
+
+            for exact_match in exact_matches:
+                exact_match['relevance_score'] = 20.0  # above any computed score
+
+            results = (exact_matches + scored_results)[:50]
+
+    except Exception as e:
+        logger.exception(f"搜索出错: {e}")
+        return jsonify({"error": "搜索系统异常，请稍后再试"}), 500
+    finally:
+        if conn is not None:
+            conn.close()
+
+    # Stage 2: optional re-ordering. Rows carry `heat` and `created_at`;
+    # the legacy key names are still honoured when present.
+    def popularity(row):
+        # NOTE(review): `heat` is used as the download/popularity figure
+        # because the rows have no download_count column - confirm.
+        return row.get("download_count", row.get("heat")) or 0
+
+    def created(row):
+        value = row.get("create_time", row.get("created_at"))
+        # Rows without a timestamp sort before dated rows and never get
+        # compared with them directly.
+        return (value is not None, value)
+
+    def title_of(row):
+        return row.get("title") or ""
+
+    sort_specs = {
+        "downloads": (popularity, True),
+        "downloads_asc": (popularity, False),
+        "newest": (created, True),
+        "oldest": (created, False),
+        "title_asc": (title_of, False),
+        "title_desc": (title_of, True),
+    }
+    # "similarity", an empty value or an unknown value keep the relevance order.
+    if results and isinstance(sort_by, str) and sort_by in sort_specs:
+        sort_key, descending = sort_specs[sort_by]
+        results.sort(key=sort_key, reverse=descending)
+
+    # Final clean-up: drop the fields that must not be returned.
+    for item in results:
+        item.pop("description", None)
+        item.pop("tags", None)
+        item.pop("relevance_score", None)
+
+    body = json.dumps({"results": results}, ensure_ascii=False, default=_search_json_default)
+    return Response(body, mimetype='application/json; charset=utf-8')
+
+# Recommendation API
+@app.route('/recommend_tags', methods=['POST'])
+def recommend_tags():
+    """
+    Tag-based recommendation API.
+
+    Request body (JSON object):
+        user_id: optional; the user's saved interest tags are merged in.
+        tags:    optional list of tag names (a comma separated string is
+                 accepted as well).
+
+    Posts carrying any of the resolved tags are returned (at most 50). When
+    the tags exist but no post carries them, posts whose title or content
+    contains a tag name are returned instead.
+
+    Responds with ``{"recommendations": [...]}``, or with
+    ``{"error": "暂无推荐结果"}`` (HTTP 200) when nothing can be recommended;
+    415 for a non-JSON content type and 400 for a non-object body.
+    """
+    def json_response(payload):
+        return Response(json.dumps(payload, ensure_ascii=False), mimetype='application/json; charset=utf-8')
+
+    # is_json also accepts "application/json; charset=utf-8".
+    if not request.is_json:
+        return jsonify({"error": "Content-Type must be application/json"}), 415
+
+    data = request.get_json()
+    if not isinstance(data, dict):
+        return jsonify({"error": "Request body must be a JSON object"}), 400
+
+    user_id = data.get("user_id")
+
+    # Normalise the tags: tolerate null, a comma separated string or a list.
+    raw_tags = data.get("tags") or []
+    if isinstance(raw_tags, str):
+        raw_tags = [tag.strip() for tag in raw_tags.split(',') if tag.strip()]
+    elif not isinstance(raw_tags, (list, tuple)):
+        raw_tags = []
+    tags = {tag for tag in raw_tags if isinstance(tag, str) and tag}
+
+    # Nothing to look up: answer without touching the database.
+    if not tags and not user_id:
+        return json_response({"error": "暂无推荐结果"}), 200
+
+    results = []
+    conn = get_db_conn()
+    try:
+        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
+            # Merge in the interest tags saved for the user.
+            if user_id:
+                cursor.execute("SELECT t.name FROM user_tags ut JOIN tags t ON ut.tag_id = t.id WHERE ut.user_id=%s", (user_id,))
+                tags.update(row['name'] for row in cursor.fetchall())
+
+            all_tags = list(tags)
+
+            # Resolve all tag names to ids in a single query.
+            tag_ids = []
+            if all_tags:
+                name_placeholders = ','.join(['%s'] * len(all_tags))
+                cursor.execute(f"SELECT id FROM tags WHERE name IN ({name_placeholders})", tuple(all_tags))
+                tag_ids = [row['id'] for row in cursor.fetchall()]
+
+            if tag_ids:
+                tag_placeholders = ','.join(['%s'] * len(tag_ids))
+                sql = f"""
+                    SELECT p.id, p.title, tp.name as category, p.heat,
+                           GROUP_CONCAT(tg.name) as tags
+                    FROM posts p
+                    LEFT JOIN post_tags pt ON p.id = pt.post_id
+                    LEFT JOIN tags tg ON pt.tag_id = tg.id
+                    LEFT JOIN topics tp ON p.topic_id = tp.id
+                    WHERE pt.tag_id IN ({tag_placeholders})
+                    GROUP BY p.id
+                    LIMIT 50
+                """
+                cursor.execute(sql, tuple(tag_ids))
+                results = cursor.fetchall()
+
+                # No tagged post: fall back to fuzzy title/content matching.
+                if not results:
+                    or_conditions = []
+                    params = []
+                    for tag in all_tags:
+                        or_conditions.append("p.title LIKE %s OR p.content LIKE %s")
+                        params.extend(['%' + tag + '%', '%' + tag + '%'])
+                    where_clause = ' OR '.join(or_conditions)
+                    sql = f"""
+                        SELECT p.id, p.title, tp.name as category, p.heat,
+                               GROUP_CONCAT(tg.name) as tags
+                        FROM posts p
+                        LEFT JOIN post_tags pt ON p.id = pt.post_id
+                        LEFT JOIN tags tg ON pt.tag_id = tg.id
+                        LEFT JOIN topics tp ON p.topic_id = tp.id
+                        WHERE {where_clause}
+                        GROUP BY p.id
+                        LIMIT 50
+                    """
+                    cursor.execute(sql, tuple(params))
+                    results = cursor.fetchall()
+    finally:
+        conn.close()
+
+    if not results:
+        return json_response({"error": "暂无推荐结果"}), 200
+
+    return json_response({"recommendations": results})
+
+# User interest tag management API (optional)
+def _user_tags_normalize(tags):
+    """Coerce the request's ``tags`` value into a list of non-empty names."""
+    if isinstance(tags, str):
+        return [tag.strip() for tag in tags.split(',') if tag.strip()]
+    if isinstance(tags, (list, tuple)):
+        return [tag for tag in tags if tag]
+    return []
+
+
+def _user_tags_fetch_names(cursor, user_id):
+    """Return the names of the tags currently linked to ``user_id``."""
+    cursor.execute("SELECT t.name FROM user_tags ut JOIN tags t ON ut.tag_id = t.id WHERE ut.user_id=%s", (user_id,))
+    return [row[0] for row in cursor.fetchall()]
+
+
+@app.route('/tags', methods=['POST', 'GET', 'DELETE'])
+def user_tags():
+    """
+    Manage a user's interest tags.
+
+    POST:   link the given tags to the user (JSON body: user_id, tags).
+    DELETE: unlink the given tags from the user (JSON body: user_id, tags).
+    GET:    list the user's tags (query parameter: user_id).
+
+    ``tags`` may be a list or a comma separated string. Tag names that do
+    not exist in the ``tags`` table are skipped silently. POST and DELETE
+    respond with the tag list as it is after the change.
+    Responds 415 for a non-JSON content type on POST/DELETE and 400 for a
+    non-object body, a missing user id or an empty tag list.
+    """
+    def json_response(payload):
+        return Response(json.dumps(payload, ensure_ascii=False), mimetype='application/json; charset=utf-8')
+
+    # Every method other than POST/DELETE is treated as a read.
+    if request.method not in ('POST', 'DELETE'):
+        user_id = request.args.get("user_id")
+        if not user_id:
+            return jsonify({"error": "用户ID不能为空"}), 400
+        conn = get_db_conn()
+        try:
+            with conn.cursor() as cursor:
+                tags = _user_tags_fetch_names(cursor, user_id)
+        finally:
+            conn.close()
+        return json_response({"tags": tags})
+
+    # POST and DELETE share the same request validation.
+    # is_json also accepts "application/json; charset=utf-8".
+    if not request.is_json:
+        return jsonify({"error": "Content-Type must be application/json"}), 415
+    data = request.get_json()
+    if not isinstance(data, dict):
+        return jsonify({"error": "Request body must be a JSON object"}), 400
+
+    user_id = data.get("user_id")
+    if not user_id:
+        return jsonify({"error": "用户ID不能为空"}), 400
+
+    tags = _user_tags_normalize(data.get("tags", []))
+    if not tags:
+        return jsonify({"error": "标签不能为空"}), 400
+
+    if request.method == 'POST':
+        statement = "REPLACE INTO user_tags (user_id, tag_id) VALUES (%s, %s)"
+        message = "添加成功"
+    else:
+        statement = "DELETE FROM user_tags WHERE user_id=%s AND tag_id=%s"
+        message = "删除成功"
+
+    conn = get_db_conn()
+    try:
+        with conn.cursor() as cursor:
+            for tag in tags:
+                # Resolve the tag name to its id; unknown names are skipped.
+                cursor.execute("SELECT id FROM tags WHERE name=%s", (tag,))
+                tag_row = cursor.fetchone()
+                if tag_row:
+                    cursor.execute(statement, (user_id, tag_row[0]))
+            conn.commit()
+            # Report the tag list as it is after the change.
+            current_tags = _user_tags_fetch_names(cursor, user_id)
+    finally:
+        conn.close()
+    return json_response({"msg": message, "tags": current_tags})
+
+# 添加/user_tags路由作为/tags的别名
+@app.route('/user_tags', methods=['POST', 'GET', 'DELETE'])
+def user_tags_alias():
+    """
+    Alias of the /tags route: delegates every request to ``user_tags``.
+
+    POST: add user interest tags
+    GET: list user interest tags
+    DELETE: remove user interest tags
+    """
+    response = user_tags()
+    return response
+
+# 基于用户的协同过滤推荐API
+@app.route('/user_based_recommend', methods=['POST'])
+def user_based_recommend():
+    """
+    基于用户的协同过滤推荐API
+    请求格式：{
+        "user_id": "user1",
+        "top_n": 5
+    }
+    """
+    if request.content_type != 'application/json':
+        return jsonify({"error": "Content-Type must be application/json"}), 415
+
+    data = request.get_json()
+    user_id = data.get("user_id")
+    top_n = int(data.get("top_n", 5))
+
+    if not user_id:
+        return jsonify({"error": "用户ID不能为空"}), 400
+    
+    conn = get_db_conn()
+    try:
+        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
+            # 1. 检查用户是否存在下载记录（收藏或浏览）
+            cursor.execute("""
+                SELECT COUNT(*) as count
+                FROM behaviors
+                WHERE user_id = %s AND type IN ('favorite', 'view')
+            """, (user_id,))
+            result = cursor.fetchone()
+            user_download_count = result['count'] if result else 0
+            
+            logger.info(f"用户 {user_id} 下载记录数: {user_download_count}")
+            
+            # 如果用户没有足够的行为数据，返回基于热度的推荐
+            if user_download_count < 3:
+                logger.info(f"用户 {user_id} 下载记录不足，返回热门推荐")
+                cursor.execute("""
+                    SELECT p.id, p.title, tp.name as category, p.heat
+                    FROM posts p
+                    LEFT JOIN topics tp ON p.topic_id = tp.id
+                    ORDER BY p.heat DESC
+                    LIMIT %s
+                """, (top_n,))
+                popular_seeds = cursor.fetchall()
+                return Response(json.dumps({"recommendations": popular_seeds, "type": "popular"}, ensure_ascii=False), mimetype='application/json; charset=utf-8')
+            
+            # 2. 获取用户已下载（收藏/浏览）的帖子
+            cursor.execute("""
+                SELECT post_id
+                FROM behaviors
+                WHERE user_id = %s AND type IN ('favorite', 'view')
+            """, (user_id,))
+            user_seeds = set(row['post_id'] for row in cursor.fetchall())
+            logger.info(f"用户 {user_id} 已下载种子: {user_seeds}")
+            
+            # 3. 获取所有用户-帖子下载（收藏/浏览）矩阵
+            cursor.execute("""
+                SELECT user_id, post_id
+                FROM behaviors
+                WHERE created_at > DATE_SUB(NOW(), INTERVAL 3 MONTH)
+                AND user_id <> %s AND type IN ('favorite', 'view')
+            """, (user_id,))
+            download_records = cursor.fetchall()
+            
+            if not download_records:
+                logger.info(f"没有其他用户的下载记录，返回热门推荐")
+                cursor.execute("""
+                    SELECT p.id, p.title, tp.name as category, p.heat
+                    FROM posts p
+                    LEFT JOIN topics tp ON p.topic_id = tp.id
+                    ORDER BY p.heat DESC
+                    LIMIT %s
+                """, (top_n,))
+                popular_seeds = cursor.fetchall()
+                return Response(json.dumps({"recommendations": popular_seeds, "type": "popular"}, ensure_ascii=False), mimetype='application/json; charset=utf-8')
+            
+            # 构建用户-物品矩阵
+            user_item_matrix = {}
+            for record in download_records:
+                uid = record['user_id']
+                sid = record['post_id']
+                if uid not in user_item_matrix:
+                    user_item_matrix[uid] = set()
+                user_item_matrix[uid].add(sid)
+            
+            # 4. 计算用户相似度
+            similar_users = []
+            for other_id, other_seeds in user_item_matrix.items():
+                if other_id == user_id:
+                    continue
+                intersection = len(user_seeds.intersection(other_seeds))
+                union = len(user_seeds.union(other_seeds))
+                if union > 0 and intersection > 0:
+                    similarity = intersection / union
+                    similar_users.append((other_id, similarity, other_seeds))
+            logger.info(f"找到 {len(similar_users)} 个相似用户")
+            similar_users.sort(key=lambda x: x[1], reverse=True)
+            similar_users = similar_users[:5]
+            # 5. 基于相似用户推荐帖子
+            candidate_seeds = {}
+            for similar_user, similarity, seeds in similar_users:
+                logger.info(f"相似用户 {similar_user}, 相似度 {similarity}")
+                for post_id in seeds:
+                    if post_id not in user_seeds:
+                        if post_id not in candidate_seeds:
+                            candidate_seeds[post_id] = 0
+                        candidate_seeds[post_id] += similarity
+            if not candidate_seeds:
+                logger.info(f"没有找到候选种子，返回热门推荐")
+                cursor.execute("""
+                    SELECT p.id, p.title, tp.name as category, p.heat
+                    FROM posts p
+                    LEFT JOIN topics tp ON p.topic_id = tp.id
+                    ORDER BY p.heat DESC
+                    LIMIT %s
+                """, (top_n,))
+                popular_seeds = cursor.fetchall()
+                return Response(json.dumps({"recommendations": popular_seeds, "type": "popular"}, ensure_ascii=False), mimetype='application/json; charset=utf-8')
+            # 6. 获取推荐帖子的详细信息
+            recommended_seeds = sorted(candidate_seeds.items(), key=lambda x: x[1], reverse=True)[:top_n]
+            post_ids = [post_id for post_id, _ in recommended_seeds]
+            format_strings = ','.join(['%s'] * len(post_ids))
+            cursor.execute(f"""
+                SELECT p.id, p.title, tp.name as category, p.heat
+                FROM posts p
+                LEFT JOIN topics tp ON p.topic_id = tp.id
+                WHERE p.id IN ({format_strings})
+            """, tuple(post_ids))
+            result_seeds = cursor.fetchall()
+            seed_score_map = {post_id: score for post_id, score in recommended_seeds}
+            result_seeds.sort(key=lambda x: seed_score_map.get(x['id'], 0), reverse=True)
+            logger.info(f"返回 {len(result_seeds)} 个基于协同过滤的推荐")
+            return Response(json.dumps({"recommendations": result_seeds, "type": "collaborative"}, ensure_ascii=False), mimetype='application/json; charset=utf-8')
+    except Exception as e:
+        logger.error(f"推荐系统错误: {e}")
+        import traceback
+        traceback.print_exc()
+        return Response(json.dumps({"error": "推荐系统异常，请稍后再试", "details": str(e)}, ensure_ascii=False), mimetype='application/json; charset=utf-8')
+    finally:
+        conn.close()
+@app.route('/word2vec_status', methods=['GET'])
+def word2vec_status():
+    """
+    检查Word2Vec模型状态
+    返回模型是否加载、词汇量等信息
+    """
+    if not WORD2VEC_ENABLED:
+        return Response(json.dumps({
+            "enabled": False,
+            "message": "Word2Vec功能未启用"
+        }, ensure_ascii=False), mimetype='application/json; charset=utf-8')
+    try:
+        helper = get_word2vec_helper()
+        status = {
+            "enabled": WORD2VEC_ENABLED,
+            "initialized": helper.initialized,
+            "vocab_size": len(helper.model.index_to_key) if helper.model else 0,
+            "vector_size": helper.model.vector_size if helper.model else 0
+        }
+        
+        # 测试几个常用词的相似词，展示模型效果
+        test_results = {}
+        test_words = ["电影", "动作", "科幻", "动漫", "游戏"]
+        for word in test_words:
+            similar_words = helper.get_similar_words(word, topn=5)
+            test_results[word] = similar_words
+        
+        status["test_results"] = test_results
+        return Response(json.dumps(status, ensure_ascii=False), mimetype='application/json; charset=utf-8')
+    except Exception as e:
+        return Response(json.dumps({
+            "enabled": WORD2VEC_ENABLED,
+            "initialized": False,
+            "error": str(e)
+        }, ensure_ascii=False), mimetype='application/json; charset=utf-8')
+
+# 添加一个临时诊断端点
+@app.route('/debug_search', methods=['POST'])
+def debug_search():
+    """临时的调试端点，用于检查数据库中的记录"""
+    if request.content_type != 'application/json':
+        return jsonify({"error": "Content-Type must be application/json"}), 415
+
+    data = request.get_json()
+    keyword = data.get("keyword", "").strip()
+    
+    conn = get_db_conn()
+    try:
+        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
+            # 尝试查询包含特定词的所有记录
+            queries = [
+                ("标题中包含关键词", f"SELECT seed_id, title, description, tags FROM pt_seed WHERE title LIKE '%{keyword}%' LIMIT 10"),
+                ("描述中包含关键词", f"SELECT seed_id, title, description, tags FROM pt_seed WHERE description LIKE '%{keyword}%' LIMIT 10"),
+                ("标签中包含关键词", f"SELECT seed_id, title, description, tags FROM pt_seed WHERE FIND_IN_SET('{keyword}', tags) LIMIT 10"),
+                ("肖申克的救赎", "SELECT seed_id, title, description, tags FROM pt_seed WHERE title = '肖申克的救赎'")
+            ]
+            
+            results = {}
+            for query_name, query in queries:
+                cursor.execute(query)
+                results[query_name] = cursor.fetchall()
+                
+            return Response(json.dumps(results, ensure_ascii=False), mimetype='application/json; charset=utf-8')
+    finally:
+        conn.close()
+
+"""
+接口本地测试方法（可直接运行main_online.py后用curl或Postman测试）：
+
+1. 搜索接口
+curl -X POST http://127.0.0.1:5000/search -H "Content-Type: application/json" -d '{"keyword":"电影","sort_by":"downloads"}'
+
+2. 标签推荐接口
+curl -X POST http://127.0.0.1:5000/recommend_tags -H "Content-Type: application/json" -d '{"user_id":"1","tags":["动作","科幻"]}'
+
+3. 用户兴趣标签管理（添加标签）
+curl -X POST http://127.0.0.1:5000/user_tags -H "Content-Type: application/json" -d '{"user_id":"1","tags":["动作","科幻"]}'
+
+4. 用户兴趣标签管理（查询标签）
+curl "http://127.0.0.1:5000/user_tags?user_id=1"
+
+5. 用户兴趣标签管理（删除标签）
+curl -X DELETE http://127.0.0.1:5000/user_tags -H "Content-Type: application/json" -d '{"user_id":"1","tags":["动作","科幻"]}'
+
+6. 协同过滤推荐
+curl -X POST http://127.0.0.1:5000/user_based_recommend -H "Content-Type: application/json" -d '{"user_id":"user1","top_n":3}'
+
+7. Word2Vec状态检查
+curl "http://127.0.0.1:5000/word2vec_status"
+
+8. 调试接口（临时）
+curl -X POST http://127.0.0.1:5000/debug_search -H "Content-Type: application/json" -d '{"keyword":"电影"}'
+
+所有接口均可用Postman按上述参数测试。
+"""
+
+if __name__ == "__main__":
+    try:
+        logger.info("搜索推荐服务启动中...")
+        app.run(host="0.0.0.0", port=5000)
+    except Exception as e:
+        logger.error(f"启动异常: {e}")
+        import traceback
+        traceback.print_exc()

diff --git a/JWLLL/semantic_config.json b/JWLLL/semantic_config.json
new file mode 100644
index 0000000..9f54454
--- /dev/null
+++ b/JWLLL/semantic_config.json

@@ -0,0 +1,73 @@
+{
+  "国宝": ["熊猫", "大熊猫", "功夫熊猫", "四川", "成都", "保护动物"],
+  "熊猫": ["国宝", "大熊猫", "功夫熊猫", "竹子", "四川", "黑白"],
+  "功夫": ["武术", "格斗", "武打", "功夫熊猫", "李小龙", "成龙", "太极", "截拳道", "中国功夫"],
+  
+  "梦": ["梦想", "梦境", "白日梦", "白日梦想家", "潜意识", "睡眠", "做梦"],
+  "白日梦": ["梦想", "幻想", "白日梦想家", "想象", "憧憬"],
+  
+  "魔戒": ["指环王", "魔戒再现", "中土世界", "霍比特人", "精灵", "魔法", "奇幻"],
+  "指环": ["魔戒", "指环王", "戒指", "魔戒再现", "首饰"],
+  "中土世界": ["魔戒", "指环王", "霍比特人", "精灵", "矮人", "奇幻"],
+  
+  "漫威": ["复仇者", "钢铁侠", "蜘蛛侠", "美国队长", "雷神", "绿巨人", "黑寡妇", "惊奇队长", "超级英雄", "漫画"],
+  "钢铁侠": ["托尼斯塔克", "钢铁战衣", "贾维斯", "复仇者", "漫威", "超级英雄"],
+  "蜘蛛侠": ["彼得帕克", "蜘蛛", "纽约", "漫威", "超级英雄", "蜘蛛感应"],
+  
+  "DC": ["蝙蝠侠", "超人", "神奇女侠", "正义联盟", "闪电侠", "水行侠", "超级英雄", "漫画"],
+  "蝙蝠侠": ["布鲁斯韦恩", "高谭市", "小丑", "罗宾", "DC", "超级英雄"],
+  "超人": ["克拉克肯特", "氪星", "莱克斯卢瑟", "超能力", "DC", "超级英雄"],
+  
+  "星球大战": ["星战", "原力", "天行者", "达斯维达", "尤达", "绝地武士", "光剑", "帝国", "科幻"],
+  "原力": ["绝地武士", "星球大战", "天行者", "尤达", "光剑", "西斯", "科幻"],
+  
+  "哈利波特": ["魔法", "霍格沃茨", "魔杖", "魔法石", "伏地魔", "巫师", "奇幻", "魔幻"],
+  "魔法": ["巫师", "法术", "咒语", "哈利波特", "霍格沃茨", "魔杖", "奇幻", "魔幻"],
+  
+  "科幻": ["未来", "太空", "星际", "外星人", "人工智能", "机器人", "时空", "星球大战", "星际穿越"],
+  "太空": ["宇宙", "星球", "卫星", "宇航员", "航天", "科幻", "星际", "外太空"],
+  "人工智能": ["AI", "机器学习", "深度学习", "神经网络", "机器人", "算法", "科技", "科幻"],
+  
+  "动作": ["武打", "格斗", "功夫", "特技", "追逐", "冒险", "刺激", "爆破"],
+  "冒险": ["探险", "奇遇", "探索", "未知", "旅程", "冒险家", "刺激", "危险"],
+  "奇幻": ["魔法", "魔幻", "神话", "异世界", "精灵", "龙", "魔戒", "哈利波特"],
+  
+  "悬疑": ["推理", "谜题", "侦探", "神秘", "悬念", "惊悚", "犯罪", "悬疑片"],
+  "推理": ["侦探", "线索", "谜题", "破案", "悬疑", "逻辑", "智力", "悬疑片"],
+  
+  "恐怖": ["惊悚", "鬼怪", "恶魔", "惊吓", "血腥", "恐怖片", "心理恐惧", "超自然"],
+  "鬼怪": ["幽灵", "鬼魂", "妖怪", "超自然", "恐怖", "惊悚", "诡异", "恐怖片"],
+  
+  "喜剧": ["搞笑", "幽默", "欢乐", "笑声", "喜剧片", "滑稽", "逗乐", "喜剧演员"],
+  "搞笑": ["幽默", "笑话", "喜剧", "逗乐", "滑稽", "欢乐", "喜剧片", "喜剧演员"],
+  
+  "战争": ["军事", "战场", "士兵", "军队", "战役", "武器", "战争片", "历史战争"],
+  "军事": ["军队", "武器", "战争", "军人", "战略", "战术", "国防", "军事片"],
+  
+  "剧情": ["情节", "故事", "叙事", "人物", "感人", "真实", "戏剧性", "剧情片"],
+  "历史": ["古代", "历史事件", "历史人物", "朝代", "文明", "历史片", "传记", "纪实"],
+  
+  "纪录片": ["真实记录", "纪实", "历史", "自然", "科学", "社会", "文化", "探索"],
+  "动画": ["卡通", "动漫", "动画片", "动画电影", "CG", "3D动画", "手绘", "二次元"],
+  
+  "音乐": ["歌曲", "旋律", "节奏", "乐器", "演唱", "音乐家", "音乐剧", "音乐会"],
+  "歌曲": ["歌词", "唱歌", "歌手", "流行歌曲", "音乐", "专辑", "单曲", "MV"],
+  
+  "爱情": ["恋爱", "浪漫", "情侣", "爱情故事", "爱情片", "感情", "爱意", "约会"],
+  "浪漫": ["爱情", "情感", "温馨", "甜蜜", "爱意", "爱情片", "情侣", "表白"],
+  
+  "Netflix": ["网飞", "流媒体", "自制剧", "电视剧", "纸牌屋", "怪奇物语", "王冠", "订阅"],
+  "迪士尼": ["米老鼠", "唐老鸭", "公主", "动画", "迪士尼乐园", "皮克斯", "童话", "漫威"],
+  
+  "游戏": ["电子游戏", "游戏机", "主机游戏", "PC游戏", "手游", "网游", "单机", "多人游戏"],
+  "动漫": ["日本动画", "漫画", "二次元", "动画", "动画片", "ACGN", "宅文化", "御宅族"],
+  
+  "日本": ["东京", "京都", "大阪", "日本文化", "日本料理", "樱花", "动漫", "武士道"],
+  "美国": ["纽约", "洛杉矶", "华盛顿", "美国文化", "好莱坞", "自由女神像", "美式"],
+  
+  "教育": ["学习", "知识", "课程", "教学", "学校", "教科书", "老师", "学生"],
+  "技术": ["科技", "工程", "编程", "软件", "硬件", "开发", "技术革新", "IT"],
+  
+  "监狱": ["越狱", "囚犯", "牢房", "服刑", "狱警"],
+  "越狱": ["监狱", "囚犯", "逃狱", "越狱计划", "监狱逃脱"]
+}

diff --git a/JWLLL/word2vec_helper.py b/JWLLL/word2vec_helper.py
new file mode 100644
index 0000000..ecd1a72
--- /dev/null
+++ b/JWLLL/word2vec_helper.py

@@ -0,0 +1,279 @@
+# word2vec_helper.py
+# Word2Vec模型加载与使用的辅助模块
+
+import os
+import numpy as np
+from gensim.models import KeyedVectors, Word2Vec
+import jieba
+import logging
+import time
+
+# 设置日志
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+class Word2VecHelper:
+    def __init__(self, model_path=None):
+        """
+        初始化Word2Vec辅助类
+        
+        参数:
+            model_path: 预训练模型路径，支持word2vec格式和二进制格式
+                        如果为None，将使用默认路径或尝试下载小型模型
+        """
+        self.model = None
+        
+        # 更改默认模型路径和备用选项
+        if model_path:
+            self.model_path = model_path
+        else:
+            # 首选路径 - 大型腾讯模型
+            primary_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 
+                                      "models", "chinese_word2vec.bin")
+            
+            # 备用路径 - 小型模型
+            backup_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 
+                                     "models", "chinese_word2vec_small.bin")
+            
+            if os.path.exists(primary_path):
+                self.model_path = primary_path
+            elif os.path.exists(backup_path):
+                self.model_path = backup_path
+            else:
+                # 如果都不存在，可以尝试自动下载小模型
+                self.model_path = primary_path
+                self._try_download_small_model()
+        
+        self.initialized = False
+        # 缓存查询结果，提高性能
+        self.similarity_cache = {}
+        self.similar_words_cache = {}
+    
+    def _try_download_small_model(self):
+        """尝试下载小型词向量模型作为备用选项"""
+        try:
+            import gensim.downloader as api
+            logging.info("尝试下载小型中文词向量模型...")
+            
+            # 创建模型目录
+            os.makedirs(os.path.dirname(self.model_path), exist_ok=True)
+            
+            # 尝试下载fastText的小型中文模型
+            small_model = api.load("fasttext-wiki-news-subwords-300")
+            small_model.save(self.model_path.replace(".bin", "_small.bin"))
+            logging.info(f"小型模型已下载并保存到 {self.model_path}")
+        except Exception as e:
+            logging.error(f"无法下载备用模型: {e}")
+
+    def load_model(self):
+        """加载Word2Vec模型"""
+        try:
+            start_time = time.time()
+            logging.info(f"开始加载Word2Vec模型: {self.model_path}")
+            
+            # 判断文件扩展名，选择合适的加载方式
+            if self.model_path.endswith('.bin'):
+                # 加载二进制格式的模型
+                self.model = KeyedVectors.load_word2vec_format(self.model_path, binary=True)
+            else:
+                # 加载文本格式的模型或gensim模型
+                self.model = Word2Vec.load(self.model_path).wv
+                
+            self.initialized = True
+            logging.info(f"Word2Vec模型加载完成，耗时 {time.time() - start_time:.2f} 秒")
+            logging.info(f"词向量维度: {self.model.vector_size}")
+            logging.info(f"词汇表大小: {len(self.model.index_to_key)}")
+            return True
+        except Exception as e:
+            logging.error(f"加载Word2Vec模型失败: {e}")
+            self.initialized = False
+            return False
+    
+    def ensure_initialized(self):
+        """确保模型已初始化"""
+        if not self.initialized:
+            return self.load_model()
+        return True
+    
+    def get_similar_words(self, word, topn=10, min_similarity=0.5):
+        """
+        获取与给定词语最相似的词语列表
+        
+        参数:
+            word: 输入词语
+            topn: 返回相似词的数量
+            min_similarity: 最小相似度阈值
+        返回:
+            相似词列表，如果词不存在或模型未加载则返回空列表
+        """
+        if not self.ensure_initialized():
+            return []
+            
+        # 检查缓存
+        cache_key = f"{word}_{topn}_{min_similarity}"
+        if cache_key in self.similar_words_cache:
+            return self.similar_words_cache[cache_key]
+        
+        try:
+            # 如果词不在词汇表中，进行分词处理
+            if word not in self.model.key_to_index:
+                # 对中文词进行分词，然后查找每个子词的相似词
+                word_parts = list(jieba.cut(word))
+                
+                if not word_parts:
+                    return []
+                
+                # 如果存在多个子词，找到存在于模型中的子词
+                valid_parts = [w for w in word_parts if w in self.model.key_to_index]
+                
+                if not valid_parts:
+                    return []
+                
+                # 使用最长的有效子词或第一个有效子词
+                valid_parts.sort(key=len, reverse=True)
+                word = valid_parts[0]
+                
+                # 如果替换后的词仍不在词汇表中，返回空列表
+                if word not in self.model.key_to_index:
+                    return []
+            
+            # 获取相似词
+            similar_words = self.model.most_similar(word, topn=topn*2)  # 多获取一些，后续过滤
+            
+            # 过滤低于阈值的结果，并只返回词语(不返回相似度)
+            filtered_words = [w for w, sim in similar_words if sim >= min_similarity][:topn]
+            
+            # 缓存结果
+            self.similar_words_cache[cache_key] = filtered_words
+            return filtered_words
+            
+        except Exception as e:
+            logging.error(f"获取相似词失败: {e}, 词语: {word}")
+            return []
+    
+    def calculate_similarity(self, word1, word2):
+        """
+        计算两个词的相似度
+        
+        参数:
+            word1, word2: 输入词语
+        返回:
+            相似度分数(0-1)，如果任意词不存在则返回0
+        """
+        if not self.ensure_initialized():
+            return 0
+            
+        # 检查缓存
+        cache_key = f"{word1}_{word2}"
+        reverse_key = f"{word2}_{word1}"
+        
+        if cache_key in self.similarity_cache:
+            return self.similarity_cache[cache_key]
+        if reverse_key in self.similarity_cache:
+            return self.similarity_cache[reverse_key]
+        
+        try:
+            # 检查词是否在词汇表中
+            if word1 not in self.model.key_to_index or word2 not in self.model.key_to_index:
+                return 0
+            
+            similarity = self.model.similarity(word1, word2)
+            
+            # 缓存结果
+            self.similarity_cache[cache_key] = similarity
+            return similarity
+            
+        except Exception as e:
+            logging.error(f"计算相似度失败: {e}, 词语: {word1}, {word2}")
+            return 0
+    
+    def expand_query(self, query, topn=5, min_similarity=0.6):
+        """
+        扩展查询词，返回相关词汇
+        
+        参数:
+            query: 查询词
+            topn: 每个词扩展的相似词数量
+            min_similarity: 最小相似度阈值
+        返回:
+            扩展后的词语列表
+        """
+        if not self.ensure_initialized():
+            return [query]
+            
+        expanded_terms = [query]
+        
+        # 对查询进行分词
+        words = list(jieba.cut(query))
+        
+        # 为每个词找相似词
+        for word in words:
+            if len(word) <= 1:  # 忽略单字，减少噪音
+                continue
+                
+            similar_words = self.get_similar_words(word, topn=topn, min_similarity=min_similarity)
+            expanded_terms.extend(similar_words)
+        
+        # 确保唯一性
+        return list(set(expanded_terms))
+
+# 单例模式，全局使用一个模型实例
+_word2vec_helper = None
+
+def get_word2vec_helper(model_path=None):
+    """获取Word2Vec辅助类的全局单例"""
+    global _word2vec_helper
+    if _word2vec_helper is None:
+        _word2vec_helper = Word2VecHelper(model_path)
+        _word2vec_helper.ensure_initialized()
+    return _word2vec_helper
+
+# 便捷函数，方便直接调用
+def get_similar_words(word, topn=10, min_similarity=0.5):
+    """获取相似词的便捷函数"""
+    helper = get_word2vec_helper()
+    return helper.get_similar_words(word, topn, min_similarity)
+
+def calculate_similarity(word1, word2):
+    """计算相似度的便捷函数"""
+    helper = get_word2vec_helper()
+    return helper.calculate_similarity(word1, word2)
+
+def expand_query(query, topn=5, min_similarity=0.6):
+    """扩展查询的便捷函数"""
+    helper = get_word2vec_helper()
+    return helper.expand_query(query, topn, min_similarity)
+
+# 使用示例
+if __name__ == "__main__":
+    # 测试模型加载和词语相似度
+    helper = get_word2vec_helper()
+    
+    # 测试词
+    test_words = ["电影", "功夫", "熊猫", "科幻", "漫威"]
+    
+    for word in test_words:
+        print(f"\n{word} 的相似词:")
+        similar = helper.get_similar_words(word, topn=5)
+        for sim_word in similar:
+            print(f"  - {sim_word}")
+    
+    # 测试相似度计算
+    word_pairs = [
+        ("电影", "电视"),
+        ("功夫", "武术"),
+        ("科幻", "未来"),
+        ("漫威", "超级英雄")
+    ]
+    
+    print("\n词语相似度:")
+    for w1, w2 in word_pairs:
+        sim = helper.calculate_similarity(w1, w2)
+        print(f"  {w1} <-> {w2}: {sim:.4f}")
+    
+    # 测试查询扩展
+    test_queries = ["功夫熊猫", "科幻电影", "漫威英雄"]
+    
+    print("\n查询扩展:")
+    for query in test_queries:
+        expanded = helper.expand_query(query)
+        print(f"  {query} -> {expanded}")

diff --git "a/JWLLL/\346\216\245\345\217\243\346\265\213\350\257\225\350\257\264\346\230\216.md" "b/JWLLL/\346\216\245\345\217\243\346\265\213\350\257\225\350\257\264\346\230\216.md"
new file mode 100644
index 0000000..1537055
--- /dev/null
+++ "b/JWLLL/\346\216\245\345\217\243\346\265\213\350\257\225\350\257\264\346\230\216.md"

@@ -0,0 +1,186 @@
+# 接口说明文档
+
+本服务为资源搜索与推荐API，所有接口均支持Postman测试。每个接口均包含功能说明、请求方式、参数、返回值、详细Postman测试方法，并补充了核心逻辑和原理说明。
+
+---
+
+## 1. 搜索接口
+- **接口功能**：根据关键词、分类、标签等条件搜索资源。
+- **核心逻辑与原理**：
+  - 支持关键词分词、拼音、语义扩展（包括自定义语义映射和Word2Vec相似词扩展）。
+  - 支持多字段（标题、内容、分类、标签）模糊匹配。
+  - 相关性打分综合考虑精确匹配、分词、拼音、标签、描述、分类等多种因素。
+  - 支持多种排序方式（热度、时间、相似度等）。
+- **请求方式**：POST
+- **URL**：`/search`
+- **请求参数**：
+  | 参数名      | 类型    | 必填 | 说明                                   |
+  | ----------- | ------- | ---- | -------------------------------------- |
+  | keyword     | string  | 是   | 搜索关键词                             |
+  | sort_by     | string  | 否   | 排序方式（downloads、similarity等）    |
+  | category    | string  | 否   | 分类名                                 |
+  | search_mode | string  | 否   | 搜索模式（title、title_desc、all等）   |
+  | tags        | array   | 否   | 标签数组                               |
+- **返回说明**：
+  - results: 资源列表，每项包含id、title、category、heat、created_at等字段。
+
+**Postman测试方法：**
+1. 新建POST请求，URL填`http://127.0.0.1:5000/search`
+2. Body选择raw，类型JSON，内容示例：
+   ```json
+   {
+     "keyword": "电影",
+     "sort_by": "downloads"
+   }
+   ```
+3. 点击Send，查看返回结果。
+
+---
+
+## 2. 标签推荐接口
+- **接口功能**：根据用户兴趣标签推荐相关资源。
+- **核心逻辑与原理**：
+  - 首先根据用户兴趣标签（user_tags表+tags表）查找相关资源。
+  - 若无结果，则用标签名模糊匹配资源标题和内容。
+  - 推荐结果按相关性和热度排序。
+- **请求方式**：POST
+- **URL**：`/recommend_tags`
+- **请求参数**：
+  | 参数名   | 类型    | 必填 | 说明           |
+  | -------- | ------- | ---- | -------------- |
+  | user_id  | string  | 是   | 用户ID         |
+  | tags     | array   | 否   | 用户关注标签   |
+- **返回说明**：
+  - recommendations: 推荐资源列表。
+
+**Postman测试方法：**
+1. 新建POST请求，URL填`http://127.0.0.1:5000/recommend_tags`
+2. Body选择raw，类型JSON，内容示例：
+   ```json
+   {
+     "user_id": "1",
+     "tags": ["动作", "科幻"]
+   }
+   ```
+3. 点击Send，查看推荐结果。
+
+---
+
+## 3. 用户兴趣标签管理接口
+- **接口功能**：管理用户兴趣标签（增删查）。
+- **核心逻辑与原理**：
+  - 用户标签数据存储在 user_tags 表，通过 tag_id 关联 tags 表，所有操作均以标签名为主。
+  - 支持添加、删除、查询用户兴趣标签。
+- **请求方式**：POST/GET/DELETE
+- **URL**：`/user_tags` 或 `/tags`
+- **请求参数**：
+  - POST/DELETE：
+    | 参数名  | 类型    | 必填 | 说明     |
+    | ------- | ------- | ---- | -------- |
+    | user_id | string  | 是   | 用户ID   |
+    | tags    | array   | 是   | 标签数组 |
+  - GET：
+    | 参数名  | 类型    | 必填 | 说明     |
+    | ------- | ------- | ---- | -------- |
+    | user_id | string  | 是   | 用户ID   |
+- **返回说明**：
+  - tags: 用户当前兴趣标签列表。
+
+**Postman测试方法：**
+- 添加标签（POST）：
+  1. 新建POST请求，URL填`http://127.0.0.1:5000/user_tags`
+  2. Body选择raw，类型JSON：
+     ```json
+     {
+       "user_id": "1",
+       "tags": ["动作", "科幻"]
+     }
+     ```
+  3. Send。
+- 查询标签（GET）：
+  1. 新建GET请求，URL填`http://127.0.0.1:5000/user_tags?user_id=1`
+  2. Send。
+- 删除标签（DELETE）：
+  1. 新建DELETE请求，URL填`http://127.0.0.1:5000/user_tags`
+  2. Body选择raw，类型JSON：
+     ```json
+     {
+       "user_id": "1",
+       "tags": ["动作", "科幻"]
+     }
+     ```
+  3. Send。
+
+---
+
+## 4. 协同过滤推荐接口
+- **接口功能**：基于用户行为的个性化推荐。
+- **核心逻辑与原理**：
+  - 基于 behaviors 表的用户行为（type='favorite' 或 'view'）构建用户-物品矩阵。
+  - 计算用户与其他用户的兴趣重叠度（Jaccard相似度=交集/并集），找出最相似的用户。
+  - 推荐这些相似用户收藏/浏览过、但当前用户未看过的帖子。
+  - 若用户行为数据不足或无相似用户，则推荐全站热门资源。
+- **请求方式**：POST
+- **URL**：`/user_based_recommend`
+- **请求参数**：
+  | 参数名  | 类型    | 必填 | 说明         |
+  | ------- | ------- | ---- | ------------ |
+  | user_id | string  | 是   | 用户ID       |
+  | top_n   | int     | 否   | 推荐数量     |
+- **返回说明**：
+  - recommendations: 推荐资源列表。
+
+**Postman测试方法：**
+1. 新建POST请求，URL填`http://127.0.0.1:5000/user_based_recommend`
+2. Body选择raw，类型JSON：
+   ```json
+   {
+     "user_id": "1",
+     "top_n": 3
+   }
+   ```
+3. Send。
+
+---
+
+## 5. Word2Vec状态检查接口
+- **接口功能**：检查Word2Vec模型加载状态。
+- **核心逻辑与原理**：
+  - 检查Word2Vec模型是否加载成功，返回词汇量、向量维度、部分词的相似词等信息。
+- **请求方式**：GET
+- **URL**：`/word2vec_status`
+- **返回说明**：
+  - enabled, initialized, vocab_size, vector_size, test_results等。
+
+**Postman测试方法：**
+1. 新建GET请求，URL填`http://127.0.0.1:5000/word2vec_status`
+2. Send。
+
+---
+
+## 6. 调试接口
+- **接口功能**：数据库调试与数据检查（临时接口，仅供开发调试使用）。
+- **核心逻辑与原理**：
+  - 通过多种SQL查询，辅助开发者调试数据库内容和搜索命中情况。
+- **请求方式**：POST
+- **URL**：`/debug_search`
+- **请求参数**：
+  | 参数名  | 类型    | 必填 | 说明     |
+  | ------- | ------- | ---- | -------- |
+  | keyword | string  | 是   | 关键词   |
+- **返回说明**：
+  - 各类调试用的数据库查询结果。
+
+**Postman测试方法：**
+1. 新建POST请求，URL填`http://127.0.0.1:5000/debug_search`
+2. Body选择raw，类型JSON：
+   ```json
+   {
+     "keyword": "电影"
+   }
+   ```
+3. Send。
+
+---
+
+如需补充其它接口或参数说明，随时联系！
commit	c60688d6eefe9ea5322a6807a13d1c06a016f151	[log] [tgz]
author	956303669 <956303669@qq.com>	Sat Jun 14 22:03:10 2025 +0800
committer	Gerrit Code Review <root@debian>	Sat Jun 14 22:03:10 2025 +0800
tree	dd00fed24e80043c974853ec575157ffb00f808c
parent	cae762d729ecc9fc3f30c26cfde42bd4b06bb5c4 [diff]
parent	a520ffd4934743d11f24891080107b6de9b08633 [diff]