Blame - recommend/recommend.py - G3Backend

blob: 25032a02b5ddc215fe3fe3255785957c63cd4a6a [file] [log] [blame]

22301110	f2e3c09	2025-06-05 01:24:43 +0800	[diff] [blame^]	1	import os
				2	import time
				3	import jieba
				4	import fasttext
				5	import pandas as pd
				6	from flask import Flask, request, jsonify
				7	from sqlalchemy import create_engine
				8	from scipy.sparse import coo_matrix
				9	from sklearn.metrics.pairwise import cosine_similarity
				10	import pickle
				11
				12	app = Flask(__name__)
				13
				14	# === ✅ SQLAlchemy 数据库连接 ===
				15	engine = create_engine("mysql+pymysql://sy:sy_password@49.233.215.144:3306/pt_station")
				16
				17	# === ✅ 加载 fastText 模型 ===
				18	fasttext_model_path = 'E:\\course\\pt\\recommend\\models\\cc.zh.300.bin'
				19	if not os.path.exists(fasttext_model_path):
				20	raise FileNotFoundError("fastText 模型文件不存在，请检查路径。")
				21	print("加载 fastText 模型中...")
				22	ft_model = fasttext.load_model(fasttext_model_path)
				23	print("模型加载完成 ✅")
				24
				25	# === ✅ 用户标签行为矩阵构建 ===
				26	def get_user_tag_matrix():
				27	df = pd.read_sql("SELECT user_id, tag, score FROM user_tag_scores", engine)
				28	print(df)
				29	df['user_id'] = df['user_id'].astype(str)
				30	user_map = {u: i for i, u in enumerate(df['user_id'].unique())}
				31	tag_map = {t: i for i, t in enumerate(df['tag'].unique())}
				32	df['user_index'] = df['user_id'].map(user_map)
				33	df['tag_index'] = df['tag'].map(tag_map)
				34	matrix = df.pivot_table(index='user_id', columns='tag', values='score', fill_value=0)
				35	sparse_matrix = coo_matrix((df['score'], (df['tag_index'], df['user_index'])))
				36	return df, matrix, sparse_matrix, user_map, tag_map
				37
				38	# === ✅ 基于 fastText 的语义相似推荐方法 ===
				39	def semantic_recommend(user_id, topn=5):
				40	print(f"正在为用户 {user_id} 生成推荐...")
				41
				42	# 读取数据库中的用户标签数据
				43	df = pd.read_sql("SELECT user_id, tag, score FROM user_tag_scores", engine)
				44	print(f"总记录数: {len(df)}")
				45	print(f"数据示例:\n{df.head()}")
				46	print(df.dtypes)
				47	user_id = str(user_id) # 确保匹配
				48
				49	# 获取该用户的所有标签（按分数从高到低排序）
				50	user_tags = df[df['user_id'] == user_id].sort_values(by="score", ascending=False)['tag'].tolist()
				51	print(f"用户 {user_id} 的标签（按分数排序）: {user_tags}")
				52
				53	if not user_tags:
				54	print(f"用户 {user_id} 没有标签记录，返回空推荐结果。")
				55	return []
				56
				57	# 截取前 3 个标签作为“兴趣标签”
				58	user_tags = user_tags[:3]
				59	print(f"用户 {user_id} 的 Top 3 标签: {user_tags}")
				60
				61	# 构造所有标签的词向量
				62	all_tags = df['tag'].unique()
				63	print(f"所有唯一标签数量: {len(all_tags)}")
				64
				65	tag_vectors = {}
				66	for tag in all_tags:
				67	vec = ft_model.get_word_vector(tag)
				68	tag_vectors[tag] = vec
				69
				70	# 计算未出现过标签的相似度得分
				71	scores = {}
				72	for tag in all_tags:
				73	if tag in user_tags:
				74	continue
				75	vec = tag_vectors[tag]
				76	sim_total = 0.0
				77	for t in user_tags:
				78	sim = cosine_similarity([vec], [ft_model.get_word_vector(t)])[0][0]
				79	print(f"标签 [{tag}] 与用户标签 [{t}] 的相似度: {sim:.4f}")
				80	sim_total += sim
				81	avg_score = sim_total / len(user_tags)
				82	scores[tag] = avg_score
				83	print(f"标签 [{tag}] 的平均相似度得分: {avg_score:.4f}")
				84
				85	# 排序并返回 topN 标签
				86	sorted_tags = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topn]
				87	print(f"\n最终推荐标签（前 {topn}）:")
				88	for tag, score in sorted_tags:
				89	print(f"{tag}: {score:.4f}")
				90
				91	return [tag for tag, _ in sorted_tags]
				92
				93	# === ✅ ItemCF 推荐方法 ===
				94	import os
				95	import pickle
				96
				97	def itemcf_recommend(user_id, matrix, sim_path="./models/itemcf_sim.pkl", topn=5):
				98	user_id = str(user_id) # 确保 user_id 类型一致
				99	print(matrix.index.dtype)
				100	print(type(user_id)) # 应该是 str
				101
				102	if user_id not in matrix.index:
				103	print(f"⚠️ 用户 {user_id} 不在评分矩阵中。")
				104	return []
				105
				106	if not os.path.exists(sim_path):
				107	print(f"⚠️ 用户 {user_id} 不在评分矩阵中。")
				108	train_and_save_itemcf()
				109
				110	with open(sim_path, "rb") as f:
				111	sim_df = pickle.load(f)
				112
				113	user_row = matrix.loc[user_id]
				114	user_tags = user_row[user_row > 0]
				115
				116	if user_tags.empty:
				117	print(f"⚠️ 用户 {user_id} 没有任何标签评分记录。")
				118	return []
				119
				120	print(f"用户 {user_id} 的标签评分:\n{user_tags}")
				121
				122	scores = {}
				123	for tag, val in user_tags.items():
				124	if tag not in sim_df:
				125	print(f"标签 {tag} 在相似度矩阵中不存在，跳过。")
				126	continue
				127	sims = sim_df[tag].drop(index=user_tags.index, errors="ignore")
				128	for sim_tag, sim_score in sims.items():
				129	scores[sim_tag] = scores.get(sim_tag, 0) + sim_score * val
				130
				131	if not scores:
				132	print(f"⚠️ 用户 {user_id} 无法生成推荐，可能是标签相似度不足。")
				133	return []
				134
				135	sorted_tags = sorted(scores.items(), key=lambda x: x[1], reverse=True)
				136	print(f"推荐得分（前{topn}）:\n", sorted_tags[:topn])
				137
				138	return [tag for tag, _ in sorted_tags[:topn]]
				139
				140
				141	# === ✅ ItemCF 相似度训练 ===
				142	def train_and_save_itemcf(path="./models/itemcf_sim.pkl"):
				143	_, matrix, _, _, _ = get_user_tag_matrix()
				144	tag_sim = cosine_similarity(matrix.T)
				145	sim_df = pd.DataFrame(tag_sim, index=matrix.columns, columns=matrix.columns)
				146	with open(path, "wb") as f:
				147	pickle.dump(sim_df, f)
				148	print("ItemCF 相似度矩阵已保存 ✅")
				149
				150	# === ✅ Flask 推荐接口 ===
				151	import random
				152
				153	@app.route("/recommend_torrents", methods=["POST"])
				154	def recommend_torrents():
				155	data = request.get_json()
				156	user_id = data.get("user_id")
				157
				158	if not user_id:
				159	return jsonify({"error": "缺少 user_id"}), 400
				160
				161	df, matrix, _, _, _ = get_user_tag_matrix()
				162
				163	# 获取推荐标签
				164	itemcf_result = itemcf_recommend(user_id, matrix)
				165	semantic_result = semantic_recommend(user_id)
				166
				167
				168	print(f"ItemCF 推荐标签: {itemcf_result}")
				169	print(f"Semantic 推荐标签: {semantic_result}")
				170
				171	all_tags = df['tag'].unique().tolist()
				172
				173	# 存储标签及其推荐得分
				174	combined = []
				175	used_tags = set()
				176
				177	def add_unique_tags(tags, method_name):
				178	for tag in tags:
				179	if tag not in used_tags:
				180	random_priority = random.uniform(0, 1)
				181	if method_name == 'ItemCF':
				182	combined.append((tag, 'ItemCF', random_priority))
				183	elif method_name == 'Semantic':
				184	combined.append((tag, 'Semantic', random_priority))
				185	used_tags.add(tag)
				186
				187	# 添加 ItemCF 和 Semantic 推荐
				188	add_unique_tags(itemcf_result, 'ItemCF')
				189	add_unique_tags(semantic_result, 'Semantic')
				190
				191	# 添加随机标签
				192	random.shuffle(all_tags)
				193	add_unique_tags(all_tags, 'Random')
				194
				195	# 排序：按推荐得分排序，加入的随机值也会影响排序
				196	combined.sort(key=lambda x: x[2], reverse=True)
				197
				198	# 根据标签获取种子 ID
				199	final_tags = [tag for tag, _, _ in combined]
				200	print(f"最终推荐标签: {final_tags}")
				201	torrent_ids = get_torrent_ids_by_tags(final_tags)
				202
				203	return jsonify({"torrent_ids": torrent_ids})
				204
				205
				206
				207	from sqlalchemy.sql import text
				208
				209	import random
				210	from sqlalchemy import text
				211
				212	def get_torrent_ids_by_tags(tags, limit_per_tag=10):
				213	if not tags:
				214	tags = []
				215
				216	recommended_ids = set()
				217	with engine.connect() as conn:
				218	for tag in tags:
				219	query = text("""
				220	SELECT torrent_id
				221	FROM bt_torrent_tags
				222	WHERE tag = :tag
				223	LIMIT :limit
				224	""")
				225	result = conn.execute(query, {"tag": tag, "limit": limit_per_tag})
				226	for row in result:
				227	recommended_ids.add(row[0])
				228
				229	# 获取数据库中所有 torrent_id
				230	all_query = text("SELECT DISTINCT torrent_id FROM bt_torrent_tags")
				231	all_result = conn.execute(all_query)
				232	all_ids = set(row[0] for row in all_result)
				233
				234	# 剩下的（非推荐）种子 ID
				235	remaining_ids = all_ids - recommended_ids
				236
				237	# 随机打乱推荐和剩下的 ID
				238	recommended_list = list(recommended_ids)
				239	remaining_list = list(remaining_ids)
				240	random.shuffle(recommended_list)
				241	random.shuffle(remaining_list)
				242
				243	return recommended_list + remaining_list
				244
				245
				246	# === ✅ 启动服务 ===
				247	if __name__ == '__main__':
				248	train_and_save_itemcf()
				249	from waitress import serve
				250	serve(app, host="0.0.0.0", port=5000, threads=16)