feat: 加入了download_model.py进行模型的下载和训练,修复了recommend.py的bug

Change-Id: I72fb3fdb160ff73934396e2127ae6432e8a727c8
diff --git a/recommend/recommend.py b/recommend/recommend.py
new file mode 100644
index 0000000..25032a0
--- /dev/null
+++ b/recommend/recommend.py
@@ -0,0 +1,250 @@
+import os
+import time
+import jieba
+import fasttext
+import pandas as pd
+from flask import Flask, request, jsonify
+from sqlalchemy import create_engine
+from scipy.sparse import coo_matrix
+from sklearn.metrics.pairwise import cosine_similarity
+import pickle
+
+app = Flask(__name__)
+
+# === ✅ SQLAlchemy 数据库连接 ===
+engine = create_engine("mysql+pymysql://sy:sy_password@49.233.215.144:3306/pt_station")
+
+# === ✅ 加载 fastText 模型 ===
+fasttext_model_path = 'E:\\course\\pt\\recommend\\models\\cc.zh.300.bin'
+if not os.path.exists(fasttext_model_path):
+    raise FileNotFoundError("fastText 模型文件不存在,请检查路径。")
+print("加载 fastText 模型中...")
+ft_model = fasttext.load_model(fasttext_model_path)
+print("模型加载完成 ✅")
+
+# === ✅ 用户标签行为矩阵构建 ===
+def get_user_tag_matrix():
+    df = pd.read_sql("SELECT user_id, tag, score FROM user_tag_scores", engine)
+    print(df)
+    df['user_id'] = df['user_id'].astype(str)
+    user_map = {u: i for i, u in enumerate(df['user_id'].unique())}
+    tag_map = {t: i for i, t in enumerate(df['tag'].unique())}
+    df['user_index'] = df['user_id'].map(user_map)
+    df['tag_index'] = df['tag'].map(tag_map)
+    matrix = df.pivot_table(index='user_id', columns='tag', values='score', fill_value=0)
+    sparse_matrix = coo_matrix((df['score'], (df['tag_index'], df['user_index'])))
+    return df, matrix, sparse_matrix, user_map, tag_map
+
+# === ✅ 基于 fastText 的语义相似推荐方法 ===
+def semantic_recommend(user_id, topn=5):
+    print(f"正在为用户 {user_id} 生成推荐...")
+
+    # 读取数据库中的用户标签数据
+    df = pd.read_sql("SELECT user_id, tag, score FROM user_tag_scores", engine)
+    print(f"总记录数: {len(df)}")
+    print(f"数据示例:\n{df.head()}")
+    print(df.dtypes)
+    user_id = str(user_id)  # 确保匹配
+
+    # 获取该用户的所有标签(按分数从高到低排序)
+    user_tags = df[df['user_id'] == user_id].sort_values(by="score", ascending=False)['tag'].tolist()
+    print(f"用户 {user_id} 的标签(按分数排序): {user_tags}")
+
+    if not user_tags:
+        print(f"用户 {user_id} 没有标签记录,返回空推荐结果。")
+        return []
+
+    # 截取前 3 个标签作为“兴趣标签”
+    user_tags = user_tags[:3]
+    print(f"用户 {user_id} 的 Top 3 标签: {user_tags}")
+
+    # 构造所有标签的词向量
+    all_tags = df['tag'].unique()
+    print(f"所有唯一标签数量: {len(all_tags)}")
+
+    tag_vectors = {}
+    for tag in all_tags:
+        vec = ft_model.get_word_vector(tag)
+        tag_vectors[tag] = vec
+
+    # 计算未出现过标签的相似度得分
+    scores = {}
+    for tag in all_tags:
+        if tag in user_tags:
+            continue
+        vec = tag_vectors[tag]
+        sim_total = 0.0
+        for t in user_tags:
+            sim = cosine_similarity([vec], [ft_model.get_word_vector(t)])[0][0]
+            print(f"标签 [{tag}] 与用户标签 [{t}] 的相似度: {sim:.4f}")
+            sim_total += sim
+        avg_score = sim_total / len(user_tags)
+        scores[tag] = avg_score
+        print(f"标签 [{tag}] 的平均相似度得分: {avg_score:.4f}")
+
+    # 排序并返回 topN 标签
+    sorted_tags = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topn]
+    print(f"\n最终推荐标签(前 {topn}):")
+    for tag, score in sorted_tags:
+        print(f"{tag}: {score:.4f}")
+
+    return [tag for tag, _ in sorted_tags]
+
+# === ✅ ItemCF 推荐方法 ===
+import os
+import pickle
+
+def itemcf_recommend(user_id, matrix, sim_path="./models/itemcf_sim.pkl", topn=5):
+    user_id = str(user_id)  # 确保 user_id 类型一致
+    print(matrix.index.dtype)
+    print(type(user_id))  # 应该是 str
+
+    if user_id not in matrix.index:
+        print(f"⚠️ 用户 {user_id} 不在评分矩阵中。")
+        return []
+
+    if not os.path.exists(sim_path):
+        print(f"⚠️ 用户 {user_id} 不在评分矩阵中。")
+        train_and_save_itemcf()
+
+    with open(sim_path, "rb") as f:
+        sim_df = pickle.load(f)
+
+    user_row = matrix.loc[user_id]
+    user_tags = user_row[user_row > 0]
+
+    if user_tags.empty:
+        print(f"⚠️ 用户 {user_id} 没有任何标签评分记录。")
+        return []
+
+    print(f"用户 {user_id} 的标签评分:\n{user_tags}")
+
+    scores = {}
+    for tag, val in user_tags.items():
+        if tag not in sim_df:
+            print(f"标签 {tag} 在相似度矩阵中不存在,跳过。")
+            continue
+        sims = sim_df[tag].drop(index=user_tags.index, errors="ignore")
+        for sim_tag, sim_score in sims.items():
+            scores[sim_tag] = scores.get(sim_tag, 0) + sim_score * val
+
+    if not scores:
+        print(f"⚠️ 用户 {user_id} 无法生成推荐,可能是标签相似度不足。")
+        return []
+
+    sorted_tags = sorted(scores.items(), key=lambda x: x[1], reverse=True)
+    print(f"推荐得分(前{topn}):\n", sorted_tags[:topn])
+
+    return [tag for tag, _ in sorted_tags[:topn]]
+
+
+# === ✅ ItemCF 相似度训练 ===
+def train_and_save_itemcf(path="./models/itemcf_sim.pkl"):
+    _, matrix, _, _, _ = get_user_tag_matrix()
+    tag_sim = cosine_similarity(matrix.T)
+    sim_df = pd.DataFrame(tag_sim, index=matrix.columns, columns=matrix.columns)
+    with open(path, "wb") as f:
+        pickle.dump(sim_df, f)
+    print("ItemCF 相似度矩阵已保存 ✅")
+
+# === ✅ Flask 推荐接口 ===
+import random
+
+@app.route("/recommend_torrents", methods=["POST"])
+def recommend_torrents():
+    data = request.get_json()
+    user_id = data.get("user_id")
+
+    if not user_id:
+        return jsonify({"error": "缺少 user_id"}), 400
+
+    df, matrix, _, _, _ = get_user_tag_matrix()
+
+    # 获取推荐标签
+    itemcf_result = itemcf_recommend(user_id, matrix)
+    semantic_result = semantic_recommend(user_id)
+
+
+    print(f"ItemCF 推荐标签: {itemcf_result}")
+    print(f"Semantic 推荐标签: {semantic_result}")
+
+    all_tags = df['tag'].unique().tolist()
+
+    # 存储标签及其推荐得分
+    combined = []
+    used_tags = set()
+
+    def add_unique_tags(tags, method_name):
+        for tag in tags:
+            if tag not in used_tags:
+                random_priority = random.uniform(0, 1)
+                if method_name == 'ItemCF':
+                    combined.append((tag, 'ItemCF', random_priority))
+                elif method_name == 'Semantic':
+                    combined.append((tag, 'Semantic', random_priority))
+                used_tags.add(tag)
+
+    # 添加 ItemCF 和 Semantic 推荐
+    add_unique_tags(itemcf_result, 'ItemCF')
+    add_unique_tags(semantic_result, 'Semantic')
+
+    # 添加随机标签
+    random.shuffle(all_tags)
+    add_unique_tags(all_tags, 'Random')
+
+    # 排序:按推荐得分排序,加入的随机值也会影响排序
+    combined.sort(key=lambda x: x[2], reverse=True)
+
+    # 根据标签获取种子 ID
+    final_tags = [tag for tag, _, _ in combined]
+    print(f"最终推荐标签: {final_tags}")
+    torrent_ids = get_torrent_ids_by_tags(final_tags)
+
+    return jsonify({"torrent_ids": torrent_ids})
+
+
+
+from sqlalchemy.sql import text
+
+import random
+from sqlalchemy import text
+
+def get_torrent_ids_by_tags(tags, limit_per_tag=10):
+    if not tags:
+        tags = []
+
+    recommended_ids = set()
+    with engine.connect() as conn:
+        for tag in tags:
+            query = text("""
+                SELECT torrent_id
+                FROM bt_torrent_tags 
+                WHERE tag = :tag 
+                LIMIT :limit
+            """)
+            result = conn.execute(query, {"tag": tag, "limit": limit_per_tag})
+            for row in result:
+                recommended_ids.add(row[0])
+
+        # 获取数据库中所有 torrent_id
+        all_query = text("SELECT DISTINCT torrent_id FROM bt_torrent_tags")
+        all_result = conn.execute(all_query)
+        all_ids = set(row[0] for row in all_result)
+
+    # 剩下的(非推荐)种子 ID
+    remaining_ids = all_ids - recommended_ids
+
+    # 随机打乱推荐和剩下的 ID
+    recommended_list = list(recommended_ids)
+    remaining_list = list(remaining_ids)
+    random.shuffle(recommended_list)
+    random.shuffle(remaining_list)
+
+    return recommended_list + remaining_list
+
+
+# === ✅ 启动服务 ===
+if __name__ == '__main__':
+    train_and_save_itemcf()
+    from waitress import serve
+    serve(app, host="0.0.0.0", port=5000, threads=16)