blob: b216d52eb64098fc95b76cb4ff1984ea61bc91f7 [file] [log] [blame]
22301110f2e3c092025-06-05 01:24:43 +08001import os
2import time
3import jieba
4import fasttext
5import pandas as pd
6from flask import Flask, request, jsonify
7from sqlalchemy import create_engine
8from scipy.sparse import coo_matrix
9from sklearn.metrics.pairwise import cosine_similarity
10import pickle
11
app = Flask(__name__)

# === ✅ SQLAlchemy database connection ===
# The connection URL can be supplied through the PT_STATION_DB_URL environment
# variable so that credentials do not have to live in source control. The
# previously hard-coded URL is kept as the fallback, so deployments that do
# not set the variable behave exactly as before.
engine = create_engine(
    os.environ.get(
        "PT_STATION_DB_URL",
        "mysql+pymysql://sy:sy_password@49.233.215.144:3306/pt_station",
    )
)
16
# === ✅ Load the fastText model ===
# Build the path with os.path.join so it resolves on every platform: the
# former literal 'models\\cc.zh.300.bin' only points inside the "models"
# directory on Windows (elsewhere the backslash is part of the file name).
fasttext_model_path = os.path.join('models', 'cc.zh.300.bin')
if not os.path.exists(fasttext_model_path):
    # Fail fast at import time: every semantic recommendation needs the model.
    raise FileNotFoundError("fastText 模型文件不存在,请检查路径。")
print("加载 fastText 模型中...")
ft_model = fasttext.load_model(fasttext_model_path)
print("模型加载完成 ✅")
24
25# === ✅ 用户标签行为矩阵构建 ===
def get_user_tag_matrix():
    """Load the user-tag scores and build dense and sparse views of them.

    Reads ``user_id``, ``tag`` and ``score`` from the ``user_tag_scores``
    table through the module-level ``engine`` and casts the user IDs to
    ``str``.

    Returns:
        A 5-tuple ``(df, matrix, sparse_matrix, user_map, tag_map)`` where

        * ``df`` is the loaded frame with ``user_index`` and ``tag_index``
          columns added,
        * ``matrix`` is a pivot table indexed by user ID with one column per
          tag and 0 where a user has no score for a tag,
        * ``sparse_matrix`` is a COO matrix of the scores whose row
          coordinate is the tag index and whose column coordinate is the
          user index,
        * ``user_map`` / ``tag_map`` map each distinct user ID / tag to its
          position in ``unique()`` order.
    """
    scores_df = pd.read_sql("SELECT user_id, tag, score FROM user_tag_scores", engine)
    scores_df['user_id'] = scores_df['user_id'].astype(str)

    # Positional indices follow the order returned by unique().
    distinct_users = scores_df['user_id'].unique()
    distinct_tags = scores_df['tag'].unique()
    user_map = dict(zip(distinct_users, range(len(distinct_users))))
    tag_map = dict(zip(distinct_tags, range(len(distinct_tags))))

    scores_df = scores_df.assign(
        user_index=scores_df['user_id'].map(user_map),
        tag_index=scores_df['tag'].map(tag_map),
    )

    dense = pd.pivot_table(
        scores_df,
        index='user_id',
        columns='tag',
        values='score',
        fill_value=0,
    )

    # Coordinates are (tag_index, user_index): tags are rows, users are columns.
    coordinates = (scores_df['tag_index'], scores_df['user_index'])
    sparse = coo_matrix((scores_df['score'], coordinates))

    return scores_df, dense, sparse, user_map, tag_map
37
38# === ✅ 基于 fastText 的语义相似推荐方法 ===
def semantic_recommend(user_id, topn=5):
    """Recommend tags that are semantically close to a user's top tags.

    Loads all rows of ``user_tag_scores``, takes the user's three
    highest-scored tags as the "interest tags", and ranks every other tag by
    its average fastText cosine similarity to those interest tags.

    Args:
        user_id: ID of the user; compared as a string against the table.
        topn: Maximum number of tags to return.

    Returns:
        Up to ``topn`` tag names ordered by descending average similarity,
        or an empty list when the user has no tag records or there is no
        tag outside the user's interest tags.
    """
    print(f"正在为用户 {user_id} 生成推荐...")

    df = pd.read_sql("SELECT user_id, tag, score FROM user_tag_scores", engine)

    # Normalise both sides to str so the equality filter below cannot miss
    # rows because of an int-vs-str mismatch.
    df['user_id'] = df['user_id'].astype(str)
    user_id = str(user_id)

    user_tags = (
        df[df['user_id'] == user_id]
        .sort_values(by="score", ascending=False)['tag']
        .tolist()
    )
    print(f"用户 {user_id} 的标签(按分数排序): {user_tags}")

    if not user_tags:
        print(f"用户 {user_id} 没有标签记录,返回空推荐结果。")
        return []

    # Keep only the three highest-scored tags as the user's interest tags.
    user_tags = user_tags[:3]
    print(f"用户 {user_id} 的 Top 3 标签: {user_tags}")

    all_tags = df['tag'].unique()
    print(f"所有唯一标签数量: {len(all_tags)}")

    # Candidates are the tags that are not already interest tags.
    candidates = [tag for tag in all_tags if tag not in user_tags]

    scores = {}
    if candidates:
        # Embed each tag once and compute every candidate-vs-interest
        # similarity in a single call instead of one call per pair.
        candidate_vectors = [ft_model.get_word_vector(tag) for tag in candidates]
        interest_vectors = [ft_model.get_word_vector(tag) for tag in user_tags]
        similarity = cosine_similarity(candidate_vectors, interest_vectors)

        for tag, row in zip(candidates, similarity):
            for interest_tag, sim in zip(user_tags, row):
                print(f"标签 [{tag}] 与用户标签 [{interest_tag}] 的相似度: {sim:.4f}")
            avg_score = row.sum() / len(user_tags)
            scores[tag] = avg_score
            print(f"标签 [{tag}] 的平均相似度得分: {avg_score:.4f}")

    # Highest average similarity first; ties keep the candidates' order.
    sorted_tags = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topn]
    print(f"\n最终推荐标签(前 {topn}):")

    return [tag for tag, _ in sorted_tags]
99
100# === ✅ ItemCF 推荐方法 ===
101import os
102import pickle
103
def itemcf_recommend(user_id, matrix, sim_path="./models/itemcf_sim.pkl", topn=5):
    """Recommend tags for a user with item-based collaborative filtering.

    Every tag the user has scored above zero votes for the other tags in the
    similarity matrix, weighted by ``similarity * user score``; tags the user
    already has are excluded from the candidates.

    Args:
        user_id: ID of the user; converted to ``str`` before the lookup.
        matrix: DataFrame of scores indexed by user ID with one column per tag.
        sim_path: Path of the pickled tag-to-tag similarity DataFrame. If the
            file does not exist it is trained and written to this path first.
        topn: Maximum number of tags to return.

    Returns:
        Up to ``topn`` tag names ordered by descending accumulated score, or
        an empty list when the user is unknown, has no positive scores, or
        none of the user's tags appear in the similarity matrix.
    """
    user_id = str(user_id)  # look the user up by string ID

    if user_id not in matrix.index:
        print(f"⚠️ 用户 {user_id} 不在评分矩阵中。")
        return []

    if not os.path.exists(sim_path):
        # Report the real cause and train into the path that is opened below;
        # training into the default path would leave a custom sim_path missing.
        print(f"⚠️ 相似度矩阵文件 {sim_path} 不存在,正在重新训练。")
        train_and_save_itemcf(sim_path)

    # NOTE(security): pickle.load executes arbitrary code embedded in the
    # file; sim_path must only ever point at a file this service wrote.
    with open(sim_path, "rb") as f:
        sim_df = pickle.load(f)

    user_row = matrix.loc[user_id]
    user_tags = user_row[user_row > 0]

    if user_tags.empty:
        print(f"⚠️ 用户 {user_id} 没有任何标签评分记录。")
        return []

    scores = {}
    for tag, val in user_tags.items():
        if tag not in sim_df:
            print(f"标签 {tag} 在相似度矩阵中不存在,跳过。")
            continue
        # Drop the tags the user already has so they are never recommended.
        sims = sim_df[tag].drop(index=user_tags.index, errors="ignore")
        for sim_tag, sim_score in sims.items():
            scores[sim_tag] = scores.get(sim_tag, 0) + sim_score * val

    if not scores:
        print(f"⚠️ 用户 {user_id} 无法生成推荐,可能是标签相似度不足。")
        return []

    sorted_tags = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print(f"推荐得分(前{topn}):\n", sorted_tags[:topn])

    return [tag for tag, _ in sorted_tags[:topn]]
144
145
146# === ✅ ItemCF 相似度训练 ===
def train_and_save_itemcf(path="./models/itemcf_sim.pkl"):
    """Compute the tag-to-tag cosine similarity matrix and pickle it.

    The similarity is computed between the columns (tags) of the pivot table
    returned by ``get_user_tag_matrix`` and stored as a DataFrame labelled by
    tag on both axes.

    Args:
        path: Destination file for the pickled similarity DataFrame.
    """
    # Only the dense pivot table (second element of the tuple) is needed.
    user_tag_table = get_user_tag_matrix()[1]
    tag_labels = user_tag_table.columns

    # Transpose so that each row handed to cosine_similarity is one tag.
    similarity = pd.DataFrame(
        cosine_similarity(user_tag_table.T),
        index=tag_labels,
        columns=tag_labels,
    )

    with open(path, "wb") as out_file:
        pickle.dump(similarity, out_file)
    print("ItemCF 相似度矩阵已保存 ✅")
154
155# === ✅ Flask 推荐接口 ===
156import random
157
@app.route("/recommend_torrents", methods=["POST"])
def recommend_torrents():
    """Return torrent IDs ordered for the user given in the JSON body.

    Expects a JSON object with a ``user_id`` field. Tags recommended by
    ItemCF and by the fastText semantic method are merged without
    duplicates, and the torrents carrying those tags are returned ahead of
    all remaining torrents.

    Returns:
        ``{"torrent_ids": [...]}`` on success, or ``{"error": ...}`` with
        HTTP status 400 when ``user_id`` is missing.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body; any non-object payload is treated as "no user_id" so the
    # client gets the 400 below rather than an AttributeError on .get().
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    user_id = data.get("user_id")

    if not user_id:
        return jsonify({"error": "缺少 user_id"}), 400

    df, matrix, _, _, _ = get_user_tag_matrix()

    # Tag recommendations from both methods.
    itemcf_result = itemcf_recommend(user_id, matrix)
    semantic_result = semantic_recommend(user_id)

    print(f"ItemCF 推荐标签: {itemcf_result}")
    print(f"Semantic 推荐标签: {semantic_result}")

    all_tags = df['tag'].unique().tolist()

    # (tag, method, random priority) triples for the merged recommendation.
    combined = []
    used_tags = set()

    def add_unique_tags(tags, method_name):
        """Record unseen tags; only ItemCF/Semantic tags join ``combined``."""
        for tag in tags:
            if tag not in used_tags:
                random_priority = random.uniform(0, 1)
                if method_name in ('ItemCF', 'Semantic'):
                    combined.append((tag, method_name, random_priority))
                used_tags.add(tag)

    add_unique_tags(itemcf_result, 'ItemCF')
    add_unique_tags(semantic_result, 'Semantic')

    # NOTE(review): tags passed with method_name 'Random' are only marked as
    # used and never appended to `combined`, so this step adds nothing to the
    # final tag list. Confirm whether that is intended before changing it:
    # appending them would make every tag count as "recommended".
    random.shuffle(all_tags)
    add_unique_tags(all_tags, 'Random')

    # The sort key is the random priority, so this orders the tags randomly.
    combined.sort(key=lambda x: x[2], reverse=True)

    # Resolve the merged tags to torrent IDs.
    final_tags = [tag for tag, _, _ in combined]
    print(f"最终推荐标签: {final_tags}")
    torrent_ids = get_torrent_ids_by_tags(final_tags)

    return jsonify({"torrent_ids": torrent_ids})
209
210
211
212from sqlalchemy.sql import text
213
214import random
215from sqlalchemy import text
216
def get_torrent_ids_by_tags(tags, limit_per_tag=10):
    """Return every torrent ID, with the ones matching ``tags`` listed first.

    For each tag, up to ``limit_per_tag`` torrent IDs are read from
    ``bt_torrent_tags``. All distinct IDs of ``bt_torrent`` are then loaded;
    the tag-matched IDs and the remaining IDs are shuffled independently and
    concatenated in that order.

    Args:
        tags: Tag names to look up; a falsy value is treated as no tags.
        limit_per_tag: Maximum number of rows fetched per tag.

    Returns:
        A list of torrent IDs: shuffled tag matches followed by the shuffled
        rest of the catalogue.
    """
    tags = tags or []
    print(f"传递给 get_torrent_ids_by_tags 的标签: {tags}")

    per_tag_query = text("""
        SELECT torrent_id
        FROM bt_torrent_tags
        WHERE tag = :tag
        LIMIT :limit
    """)
    catalogue_query = text("SELECT DISTINCT torrent_id FROM bt_torrent")

    matched_ids = set()
    with engine.connect() as conn:
        for tag in tags:
            rows = conn.execute(per_tag_query, {"tag": tag, "limit": limit_per_tag})
            print(f"标签 '{tag}' 的推荐结果:")
            for row in rows:
                torrent_id = row[0]
                print(torrent_id)  # log each matched torrent_id
                matched_ids.add(torrent_id)

        # Every torrent_id known to the database.
        catalogue_ids = {row[0] for row in conn.execute(catalogue_query)}
        print("数据库中所有torrent_id:", catalogue_ids)

    # IDs that no requested tag matched.
    unmatched_ids = catalogue_ids - matched_ids
    print(unmatched_ids)

    # Shuffle each group on its own so matches always precede the rest.
    head = list(matched_ids)
    tail = list(unmatched_ids)
    random.shuffle(head)
    random.shuffle(tail)

    return head + tail
253
254
255# === ✅ 启动服务 ===
if __name__ == '__main__':
    # Rebuild the ItemCF similarity file before the server starts.
    train_and_save_itemcf()

    # waitress is imported only when the module is run as a script.
    import waitress
    waitress.serve(app, host="0.0.0.0", port=5000, threads=16)