import os
import time
import jieba
import fasttext
import random
import pickle
import pandas as pd
from flask import Flask, request, jsonify
from sqlalchemy import create_engine, text
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

app = Flask(__name__)

# === ✅ SQLAlchemy database connection ===
engine = create_engine("mysql+pymysql://sy:sy_password@49.233.215.144:3306/pt_station")

# === ✅ Load the fastText model ===
fasttext_model_path = 'E:\\course\\pt\\recommend\\models\\cc.zh.300.bin'
if not os.path.exists(fasttext_model_path):
    raise FileNotFoundError("fastText model file not found; please check the path.")
print("Loading fastText model...")
ft_model = fasttext.load_model(fasttext_model_path)
print("Model loaded ✅")

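# Sanity-check sketch (illustrative only; "电影" is an arbitrary example word):
# cc.zh.300.bin yields 300-dimensional numpy word vectors, e.g.
#   vec = ft_model.get_word_vector("电影")   # vec.shape == (300,)
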
# === ✅ Build the user-tag behaviour matrix ===
def get_user_tag_matrix():
    df = pd.read_sql("SELECT user_id, tag, score FROM user_tag_scores", engine)
    print(df)
    df['user_id'] = df['user_id'].astype(str)
    user_map = {u: i for i, u in enumerate(df['user_id'].unique())}
    tag_map = {t: i for i, t in enumerate(df['tag'].unique())}
    df['user_index'] = df['user_id'].map(user_map)
    df['tag_index'] = df['tag'].map(tag_map)
    # Dense user x tag matrix (rows: user_id, columns: tag, values: score)
    matrix = df.pivot_table(index='user_id', columns='tag', values='score', fill_value=0)
    # Sparse tag x user matrix built from the same scores
    sparse_matrix = coo_matrix((df['score'], (df['tag_index'], df['user_index'])))
    return df, matrix, sparse_matrix, user_map, tag_map

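# Illustrative sketch only (never called): shows, with made-up rows, the shape that
# get_user_tag_matrix() produces from user_tag_scores. The user ids, tags and scores
# below are hypothetical.
def _example_user_tag_matrix():
    sample = pd.DataFrame({
        'user_id': ['1', '1', '2'],      # hypothetical users
        'tag': ['动作', '科幻', '动作'],  # hypothetical tags
        'score': [3.0, 1.0, 2.0],
    })
    # Same pivot as get_user_tag_matrix(): one row per user, one column per tag
    m = sample.pivot_table(index='user_id', columns='tag', values='score', fill_value=0)
    print(m)
    # tag      动作   科幻
    # user_id
    # 1        3.0   1.0
    # 2        2.0   0.0
    return m
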
# === ✅ Semantic recommendation based on fastText word vectors ===
def semantic_recommend(user_id, topn=5):
    print(f"Generating recommendations for user {user_id}...")

    # Read the user-tag data from the database
    df = pd.read_sql("SELECT user_id, tag, score FROM user_tag_scores", engine)
    print(f"Total records: {len(df)}")
    print(f"Sample rows:\n{df.head()}")
    print(df.dtypes)
    df['user_id'] = df['user_id'].astype(str)  # cast both sides to str so the comparison below matches
    user_id = str(user_id)

    # All tags of this user, sorted by score from high to low
    user_tags = df[df['user_id'] == user_id].sort_values(by="score", ascending=False)['tag'].tolist()
    print(f"Tags of user {user_id} (sorted by score): {user_tags}")

    if not user_tags:
        print(f"User {user_id} has no tag records; returning an empty recommendation.")
        return []

    # Keep the top 3 tags as the user's "interest tags"
    user_tags = user_tags[:3]
    print(f"Top 3 tags of user {user_id}: {user_tags}")

    # Build word vectors for all tags
    all_tags = df['tag'].unique()
    print(f"Number of unique tags: {len(all_tags)}")

    tag_vectors = {}
    for tag in all_tags:
        tag_vectors[tag] = ft_model.get_word_vector(tag)

    # Score tags the user has not interacted with by their average cosine
    # similarity to the user's interest tags
    scores = {}
    for tag in all_tags:
        if tag in user_tags:
            continue
        vec = tag_vectors[tag]
        sim_total = 0.0
        for t in user_tags:
            sim = cosine_similarity([vec], [tag_vectors[t]])[0][0]
            print(f"Similarity between tag [{tag}] and user tag [{t}]: {sim:.4f}")
            sim_total += sim
        avg_score = sim_total / len(user_tags)
        scores[tag] = avg_score
        print(f"Average similarity score for tag [{tag}]: {avg_score:.4f}")

    # Sort and return the top N tags
    sorted_tags = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topn]
    print(f"\nFinal recommended tags (top {topn}):")
    for tag, score in sorted_tags:
        print(f"{tag}: {score:.4f}")

    return [tag for tag, _ in sorted_tags]

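# Optional sketch (not wired into semantic_recommend): the per-pair sklearn calls above
# can be batched into a single matrix operation with numpy. Assumes the same fastText
# model; the helper below is illustrative only.
import numpy as np

def _batched_tag_similarity(candidate_tags, user_tags):
    """Return {tag: mean cosine similarity to user_tags}, computed in one shot."""
    cand = np.array([ft_model.get_word_vector(t) for t in candidate_tags])  # (C, 300)
    user = np.array([ft_model.get_word_vector(t) for t in user_tags])       # (U, 300)
    # Row-normalise, then a dot product yields all C x U cosine similarities at once
    # (the small epsilon guards against all-zero vectors).
    cand = cand / (np.linalg.norm(cand, axis=1, keepdims=True) + 1e-12)
    user = user / (np.linalg.norm(user, axis=1, keepdims=True) + 1e-12)
    sims = cand @ user.T                                                     # (C, U)
    return dict(zip(candidate_tags, sims.mean(axis=1)))
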
# === ✅ ItemCF recommendation ===

def itemcf_recommend(user_id, matrix, sim_path="./models/itemcf_sim.pkl", topn=5):
    user_id = str(user_id)  # keep the user_id type consistent with the matrix index
    print(matrix.index.dtype)
    print(type(user_id))  # should be str

    if user_id not in matrix.index:
        print(f"⚠️ User {user_id} is not in the rating matrix.")
        return []

    if not os.path.exists(sim_path):
        print(f"⚠️ Similarity matrix not found at {sim_path}; training it now.")
        train_and_save_itemcf(sim_path)

    with open(sim_path, "rb") as f:
        sim_df = pickle.load(f)

    user_row = matrix.loc[user_id]
    user_tags = user_row[user_row > 0]

    if user_tags.empty:
        print(f"⚠️ User {user_id} has no tag score records.")
        return []

    print(f"Tag scores of user {user_id}:\n{user_tags}")

    # Accumulate: score(candidate) = sum over the user's tags of similarity * user's score
    scores = {}
    for tag, val in user_tags.items():
        if tag not in sim_df:
            print(f"Tag {tag} is not in the similarity matrix; skipping.")
            continue
        sims = sim_df[tag].drop(index=user_tags.index, errors="ignore")
        for sim_tag, sim_score in sims.items():
            scores[sim_tag] = scores.get(sim_tag, 0) + sim_score * val

    if not scores:
        print(f"⚠️ Could not generate recommendations for user {user_id}; tag similarities may be insufficient.")
        return []

    sorted_tags = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print(f"Recommendation scores (top {topn}):\n", sorted_tags[:topn])

    return [tag for tag, _ in sorted_tags[:topn]]


# === ✅ ItemCF similarity training ===
def train_and_save_itemcf(path="./models/itemcf_sim.pkl"):
    _, matrix, _, _, _ = get_user_tag_matrix()
    # Tag-tag cosine similarity over the user dimension (columns of the user x tag matrix)
    tag_sim = cosine_similarity(matrix.T)
    sim_df = pd.DataFrame(tag_sim, index=matrix.columns, columns=matrix.columns)
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(sim_df, f)
    print("ItemCF similarity matrix saved ✅")

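# Optional sketch (illustrative only): how the pickled similarity matrix can be
# inspected, e.g. to list the tags most similar to a given one. The tag passed in
# is a hypothetical example.
def _top_similar_tags(tag, k=5, path="./models/itemcf_sim.pkl"):
    with open(path, "rb") as f:
        sim_df = pickle.load(f)
    if tag not in sim_df.columns:
        return {}
    # Drop the tag itself, then return the k most similar tags with their scores
    return sim_df[tag].drop(index=tag).sort_values(ascending=False).head(k).to_dict()
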
# === ✅ Flask recommendation endpoint ===

@app.route("/recommend_torrents", methods=["POST"])
def recommend_torrents():
    data = request.get_json() or {}
    user_id = data.get("user_id")

    if not user_id:
        return jsonify({"error": "missing user_id"}), 400

    df, matrix, _, _, _ = get_user_tag_matrix()

    # Collect recommended tags from both recommenders
    itemcf_result = itemcf_recommend(user_id, matrix)
    semantic_result = semantic_recommend(user_id)

    print(f"ItemCF recommended tags: {itemcf_result}")
    print(f"Semantic recommended tags: {semantic_result}")

    all_tags = df['tag'].unique().tolist()

    # Tags together with the method that produced them and a random priority
    combined = []
    used_tags = set()

    def add_unique_tags(tags, method_name):
        for tag in tags:
            if tag not in used_tags:
                random_priority = random.uniform(0, 1)
                combined.append((tag, method_name, random_priority))
                used_tags.add(tag)

    # Add ItemCF and Semantic recommendations first
    add_unique_tags(itemcf_result, 'ItemCF')
    add_unique_tags(semantic_result, 'Semantic')

    # Fill up with the remaining tags in random order
    random.shuffle(all_tags)
    add_unique_tags(all_tags, 'Random')

    # Sort by the random priority, which deliberately mixes the three sources
    combined.sort(key=lambda x: x[2], reverse=True)

    # Map the final tag list to torrent IDs
    final_tags = [tag for tag, _, _ in combined]
    print(f"Final recommended tags: {final_tags}")
    torrent_ids = get_torrent_ids_by_tags(final_tags)

    return jsonify({"torrent_ids": torrent_ids})
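
# Example call (sketch, assuming the service is running locally on the port
# configured below; the user_id value is hypothetical):
#
#   curl -X POST http://localhost:5000/recommend_torrents \
#        -H "Content-Type: application/json" \
#        -d '{"user_id": 1}'
#
# The response is a JSON object of the form {"torrent_ids": [...]}.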


def get_torrent_ids_by_tags(tags, limit_per_tag=10):
    if not tags:
        tags = []

    recommended_ids = set()
    with engine.connect() as conn:
        for tag in tags:
            query = text("""
                SELECT torrent_id
                FROM bt_torrent_tags
                WHERE tag = :tag
                LIMIT :limit
            """)
            result = conn.execute(query, {"tag": tag, "limit": limit_per_tag})
            for row in result:
                recommended_ids.add(row[0])

        # Fetch every torrent_id in the database
        all_query = text("SELECT DISTINCT torrent_id FROM bt_torrent_tags")
        all_result = conn.execute(all_query)
        all_ids = set(row[0] for row in all_result)

    # Remaining (non-recommended) torrent IDs
    remaining_ids = all_ids - recommended_ids

    # Shuffle both the recommended and the remaining IDs
    recommended_list = list(recommended_ids)
    remaining_list = list(remaining_ids)
    random.shuffle(recommended_list)
    random.shuffle(remaining_list)

    # Recommended torrents first, then everything else
    return recommended_list + remaining_list


# === ✅ Start the service ===
if __name__ == '__main__':
    # Pre-train the ItemCF similarity matrix, then serve the Flask app with waitress
    train_and_save_itemcf()
    from waitress import serve
    serve(app, host="0.0.0.0", port=5000, threads=16)