| import os |
| import time |
| import jieba |
| import fasttext |
| import pandas as pd |
| from flask import Flask, request, jsonify |
| from sqlalchemy import create_engine |
| from scipy.sparse import coo_matrix |
| from sklearn.metrics.pairwise import cosine_similarity |
| import pickle |
| |
app = Flask(__name__)

# === ✅ SQLAlchemy database connection ===
# The connection URL may be overridden with the PT_DATABASE_URL environment
# variable so credentials do not have to live in source control; the
# historical value stays as the default so existing deployments keep working.
# pool_pre_ping makes the pool test a connection before handing it out, so a
# connection the server closed while idle is replaced instead of failing the
# request that happened to pick it up.
engine = create_engine(
    os.environ.get(
        "PT_DATABASE_URL",
        "mysql+pymysql://sy:sy_password@49.233.215.144:3306/pt_station",
    ),
    pool_pre_ping=True,
)

# === ✅ Load the fastText model ===
# FASTTEXT_MODEL_PATH overrides the default location, which is an absolute
# Windows path and will not exist on other machines.
fasttext_model_path = os.environ.get(
    "FASTTEXT_MODEL_PATH",
    'E:\\course\\pt\\recommend\\models\\cc.zh.300.bin',
)
if not os.path.exists(fasttext_model_path):
    # Fail at import time: every semantic recommendation needs the model.
    raise FileNotFoundError("fastText 模型文件不存在,请检查路径。")
print("加载 fastText 模型中...")
ft_model = fasttext.load_model(fasttext_model_path)
print("模型加载完成 ✅")
| |
| # === ✅ 用户标签行为矩阵构建 === |
def get_user_tag_matrix():
    """Load every user/tag score and build the dense and sparse score matrices.

    Reads the ``user_tag_scores`` table through the module-level ``engine``.

    Returns:
        A 5-tuple ``(df, matrix, sparse_matrix, user_map, tag_map)``:

        * ``df`` - the raw rows with ``user_id`` cast to ``str`` and two extra
          columns, ``user_index`` and ``tag_index``.
        * ``matrix`` - ``pivot_table`` of ``score`` with one row per
          ``user_id`` and one column per ``tag``; missing cells are 0.
        * ``sparse_matrix`` - ``coo_matrix`` of the scores whose row
          coordinate is ``tag_index`` and column coordinate is ``user_index``.
        * ``user_map`` / ``tag_map`` - dicts from user id / tag to its
          integer position (order of first appearance).
    """

    def _positions(values):
        # Number the distinct values in order of first appearance.
        return {value: position for position, value in enumerate(values)}

    records = pd.read_sql("SELECT user_id, tag, score FROM user_tag_scores", engine)
    print(records)

    # Work with string ids throughout so lookups by str(user_id) succeed.
    records['user_id'] = records['user_id'].astype(str)

    user_map = _positions(records['user_id'].unique())
    tag_map = _positions(records['tag'].unique())

    records['user_index'] = records['user_id'].map(user_map)
    records['tag_index'] = records['tag'].map(tag_map)

    matrix = records.pivot_table(
        index='user_id',
        columns='tag',
        values='score',
        fill_value=0,
    )

    coordinates = (records['tag_index'], records['user_index'])
    sparse_matrix = coo_matrix((records['score'], coordinates))

    return records, matrix, sparse_matrix, user_map, tag_map
| |
| # === ✅ 基于 fastText 的语义相似推荐方法 === |
def semantic_recommend(user_id, topn=5, n_interest_tags=3):
    """Recommend tags semantically close to a user's highest-scored tags.

    The user's tags are ranked by score, the best ``n_interest_tags`` are
    taken as the "interest tags", and every tag the user does not already
    have is scored by its average fastText cosine similarity to them.

    Args:
        user_id: User identifier; compared as a string.
        topn: Maximum number of tags to return.
        n_interest_tags: How many of the user's top tags to compare against.

    Returns:
        Up to ``topn`` tag names ordered by descending average similarity,
        or an empty list when the user has no rows in ``user_tag_scores``.

    Raises:
        ValueError: If ``n_interest_tags`` is smaller than 1.
    """
    if n_interest_tags < 1:
        raise ValueError("n_interest_tags must be at least 1")

    print(f"正在为用户 {user_id} 生成推荐...")

    # Load the user/tag scores from the database.
    df = pd.read_sql("SELECT user_id, tag, score FROM user_tag_scores", engine)
    print(f"总记录数: {len(df)}")
    print(f"数据示例:\n{df.head()}")
    print(df.dtypes)
    user_id = str(user_id)

    # Compare string to string: if the column comes back numeric, comparing it
    # with a str would never match and the user would appear to have no tags.
    user_rows = df[df['user_id'].astype(str) == user_id]
    ranked = user_rows.sort_values(by="score", ascending=False)['tag'].tolist()
    # Drop repeated tags while keeping the score ordering.
    user_tags = list(dict.fromkeys(ranked))
    print(f"用户 {user_id} 的标签(按分数排序): {user_tags}")

    if not user_tags:
        print(f"用户 {user_id} 没有标签记录,返回空推荐结果。")
        return []

    # The highest-scored tags act as the user's "interest tags".
    interest_tags = user_tags[:n_interest_tags]
    print(f"用户 {user_id} 的 Top {n_interest_tags} 标签: {interest_tags}")

    # Look up each tag's word vector exactly once.
    all_tags = df['tag'].unique()
    print(f"所有唯一标签数量: {len(all_tags)}")
    tag_vectors = {tag: ft_model.get_word_vector(tag) for tag in all_tags}
    interest_vectors = [(tag, tag_vectors[tag]) for tag in interest_tags]

    # Only tags the user does not have yet are candidates. Every owned tag is
    # excluded, not just the interest tags, matching itemcf_recommend.
    owned = set(user_tags)
    scores = {}
    for tag in all_tags:
        if tag in owned:
            continue
        vec = tag_vectors[tag]
        sim_total = 0.0
        for t, t_vec in interest_vectors:
            sim = cosine_similarity([vec], [t_vec])[0][0]
            print(f"标签 [{tag}] 与用户标签 [{t}] 的相似度: {sim:.4f}")
            sim_total += sim
        avg_score = sim_total / len(interest_vectors)
        scores[tag] = avg_score
        print(f"标签 [{tag}] 的平均相似度得分: {avg_score:.4f}")

    # Rank candidates and keep the best topn.
    sorted_tags = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topn]
    print(f"\n最终推荐标签(前 {topn}):")
    for tag, score in sorted_tags:
        print(f"{tag}: {score:.4f}")

    return [tag for tag, _ in sorted_tags]
| |
| # === ✅ ItemCF 推荐方法 === |
| import os |
| import pickle |
| |
def itemcf_recommend(user_id, matrix, sim_path="./models/itemcf_sim.pkl", topn=5):
    """Recommend tags with item-based collaborative filtering.

    Each tag the user scored above 0 votes for other tags with weight
    ``similarity * user_score``; the votes are summed per candidate tag.

    Args:
        user_id: User identifier; converted to ``str`` before lookup.
        matrix: DataFrame indexed by user id (as ``str``) with one column per
            tag, holding the user's score for that tag.
        sim_path: Pickle file containing the tag-by-tag similarity DataFrame.
            If it does not exist it is trained and written to this path first.
        topn: Maximum number of tags to return.

    Returns:
        Up to ``topn`` tag names ordered by descending accumulated score, or
        an empty list when the user is unknown, has no positive scores, or no
        candidate could be scored.
    """
    user_id = str(user_id)  # matrix.index holds string ids
    print(matrix.index.dtype)
    print(type(user_id))

    if user_id not in matrix.index:
        print(f"⚠️ 用户 {user_id} 不在评分矩阵中。")
        return []

    if not os.path.exists(sim_path):
        print(f"⚠️ 相似度矩阵文件 {sim_path} 不存在,正在重新训练。")
        # Train into the path we are about to read; the trainer's default
        # path may be a different file.
        train_and_save_itemcf(sim_path)

    # NOTE: pickle.load can execute arbitrary code from the file. Only point
    # sim_path at files written by this service.
    with open(sim_path, "rb") as f:
        sim_df = pickle.load(f)

    user_row = matrix.loc[user_id]
    user_tags = user_row[user_row > 0]

    if user_tags.empty:
        print(f"⚠️ 用户 {user_id} 没有任何标签评分记录。")
        return []

    print(f"用户 {user_id} 的标签评分:\n{user_tags}")

    scores = {}
    for tag, val in user_tags.items():
        if tag not in sim_df:
            # The similarity file can be older than the score matrix.
            print(f"标签 {tag} 在相似度矩阵中不存在,跳过。")
            continue
        # Never recommend a tag the user already has.
        sims = sim_df[tag].drop(index=user_tags.index, errors="ignore")
        for sim_tag, sim_score in sims.items():
            scores[sim_tag] = scores.get(sim_tag, 0) + sim_score * val

    if not scores:
        print(f"⚠️ 用户 {user_id} 无法生成推荐,可能是标签相似度不足。")
        return []

    sorted_tags = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print(f"推荐得分(前{topn}):\n", sorted_tags[:topn])

    return [tag for tag, _ in sorted_tags[:topn]]
| |
| |
| # === ✅ ItemCF 相似度训练 === |
def train_and_save_itemcf(path="./models/itemcf_sim.pkl"):
    """Compute tag-to-tag cosine similarity and pickle it to ``path``.

    The similarity is taken between the columns (tags) of the user-by-tag
    score matrix returned by ``get_user_tag_matrix`` and stored as a
    DataFrame whose index and columns are both the tag names.

    Args:
        path: Destination pickle file. Missing parent directories are created.
    """
    _, matrix, _, _, _ = get_user_tag_matrix()

    # Transpose so each row is a tag; the result is tag x tag.
    tag_sim = cosine_similarity(matrix.T)
    sim_df = pd.DataFrame(tag_sim, index=matrix.columns, columns=matrix.columns)

    # open() does not create directories, so make sure the target one exists.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(path, "wb") as f:
        pickle.dump(sim_df, f)
    print("ItemCF 相似度矩阵已保存 ✅")
| |
| # === ✅ Flask 推荐接口 === |
| import random |
| |
@app.route("/recommend_torrents", methods=["POST"])
def recommend_torrents():
    """Return torrent ids for a user, recommended ones first.

    Expects a JSON object body with a ``user_id`` field. Tags proposed by
    ItemCF and by the semantic recommender are merged without duplicates,
    ordered by a random priority, and passed to ``get_torrent_ids_by_tags``.

    Returns:
        ``{"torrent_ids": [...]}`` on success, or ``{"error": ...}`` with
        status 400 when the body is not a JSON object or has no ``user_id``.
    """
    # silent=True yields None for a missing or malformed body instead of
    # raising, and the isinstance check covers valid JSON that is not an
    # object (null, a list, a number), so all of these get the 400 below.
    data = request.get_json(silent=True)
    user_id = data.get("user_id") if isinstance(data, dict) else None

    if not user_id:
        return jsonify({"error": "缺少 user_id"}), 400

    _, matrix, _, _, _ = get_user_tag_matrix()

    # Ask both recommenders for tags.
    itemcf_result = itemcf_recommend(user_id, matrix)
    semantic_result = semantic_recommend(user_id)

    print(f"ItemCF 推荐标签: {itemcf_result}")
    print(f"Semantic 推荐标签: {semantic_result}")

    # Merge the two lists; a tag proposed by both is kept once and credited
    # to ItemCF, which is processed first. Each entry gets a random priority.
    # NOTE(review): an earlier "Random" pass over every known tag never
    # appended anything because no branch handled that label. It is omitted
    # here rather than enabled, since adding every tag would mark almost all
    # torrents as recommended. Confirm the intended behaviour.
    combined = []
    used_tags = set()
    for method_name, tags in (("ItemCF", itemcf_result), ("Semantic", semantic_result)):
        for tag in tags:
            if tag in used_tags:
                continue
            used_tags.add(tag)
            combined.append((tag, method_name, random.uniform(0, 1)))

    # Order by the random priority, highest first.
    combined.sort(key=lambda item: item[2], reverse=True)

    # Resolve the tags to torrent ids.
    final_tags = [tag for tag, _, _ in combined]
    print(f"最终推荐标签: {final_tags}")
    torrent_ids = get_torrent_ids_by_tags(final_tags)

    return jsonify({"torrent_ids": torrent_ids})
| |
| |
| |
| from sqlalchemy.sql import text |
| |
| import random |
| from sqlalchemy import text |
| |
def get_torrent_ids_by_tags(tags, limit_per_tag=10):
    """List every torrent id, with those matching ``tags`` placed first.

    Args:
        tags: Tag names to look up; a falsy value is treated as no tags.
        limit_per_tag: Maximum number of rows fetched per tag.

    Returns:
        A list made of the ids found for the given tags (shuffled) followed
        by all other ids present in ``bt_torrent_tags`` (shuffled).
    """
    per_tag_query = text("""
        SELECT torrent_id
        FROM bt_torrent_tags
        WHERE tag = :tag
        LIMIT :limit
    """)
    every_id_query = text("SELECT DISTINCT torrent_id FROM bt_torrent_tags")

    matched = set()
    with engine.connect() as conn:
        # One bounded lookup per requested tag.
        for tag in tags or []:
            rows = conn.execute(per_tag_query, {"tag": tag, "limit": limit_per_tag})
            matched.update(row[0] for row in rows)

        # Every torrent id that carries at least one tag.
        known = {row[0] for row in conn.execute(every_id_query)}

    # Shuffle each group on its own so matched ids always come first.
    head = list(matched)
    tail = list(known - matched)
    random.shuffle(head)
    random.shuffle(tail)

    return head + tail
| |
| |
| # === ✅ 启动服务 === |
if __name__ == "__main__":
    # Build and persist the ItemCF similarity matrix before accepting traffic.
    train_and_save_itemcf()

    # waitress is imported only when the file is run as a script.
    import waitress

    waitress.serve(app, host="0.0.0.0", port=5000, threads=16)