推荐系统 Change-Id: I49b9205568f1ccf88b32b08511aff8b0bea8d1bd

commit: d7895173912a5f962183873a9fecdd03b4c4e2ed [log] [tgz]
author: Raver <22301129@bjtu.edu.cn> Wed Jun 18 17:54:38 2025 +0800
committer: Raver <22301129@bjtu.edu.cn> Wed Jun 18 17:54:38 2025 +0800
tree: 831bab5ff5b8f0919e401f24f80e86408d967152
parent: afc93da7b6b13985140ef318c32ddd1318cbae16 [diff] [blame]
diff --git a/rhj/backend/app/utils/graph_build.py b/rhj/backend/app/utils/graph_build.py
new file mode 100644
index 0000000..a453e4e
--- /dev/null
+++ b/rhj/backend/app/utils/graph_build.py

@@ -0,0 +1,115 @@
+import pymysql
+import datetime
+from collections import defaultdict
+
+SqlURL = "10.126.59.25"
+SqlPort = 3306
+Database = "redbook"  # 修改为redbook数据库
+SqlUsername = "root"
+SqlPassword = "123456"
+
+
+def fetch_user_post_data():
+    """
+    从redbook数据库的behaviors表获取用户-帖子交互数据，只包含已发布的帖子
+    """
+    conn = pymysql.connect(
+        host=SqlURL,
+        port=SqlPort,
+        user=SqlUsername,
+        password=SqlPassword,
+        database=Database,
+        charset="utf8mb4"
+    )
+    cursor = conn.cursor()
+    # 获取用户行为数据，只包含已发布帖子的行为数据
+    cursor.execute("""
+        SELECT b.user_id, b.post_id, b.type, b.value, b.created_at 
+        FROM behaviors b
+        INNER JOIN posts p ON b.post_id = p.id
+        WHERE b.type IN ('like', 'favorite', 'comment', 'view', 'share')
+        AND p.status = 'published'
+        ORDER BY b.created_at
+    """)
+    behavior_rows = cursor.fetchall()
+    cursor.close()
+    conn.close()
+    return behavior_rows
+
+
+def process_records(behavior_rows):
+    """
+    处理用户行为记录，为不同类型的行为分配权重
+    """
+    records = []
+    user_set = set()
+    post_set = set()
+    
+    # 为不同行为类型分配权重
+    behavior_weights = {
+        'view': 1,
+        'like': 2,
+        'comment': 3,
+        'share': 4,
+        'favorite': 5
+    }
+    
+    for row in behavior_rows:
+        user_id, post_id, behavior_type, value, created_at = row
+        user_set.add(user_id)
+        post_set.add(post_id)
+        
+        if isinstance(created_at, datetime.datetime):
+            ts = int(created_at.timestamp())
+        else:
+            ts = 0
+            
+        # 使用行为权重
+        weight = behavior_weights.get(behavior_type, 1) * (value or 1)
+        records.append((user_id, post_id, ts, weight))
+    
+    return records, user_set, post_set
+
+
+def build_id_maps(user_set, post_set):
+    """
+    构建用户和帖子的ID映射
+    """
+    user2idx = {uid: idx for idx, uid in enumerate(sorted(user_set))}
+    post2idx = {pid: idx for idx, pid in enumerate(sorted(post_set))}
+    return user2idx, post2idx
+
+
+def group_and_write(records, user2idx, post2idx, output_path="./app/user_post_graph.txt"):
+    """
+    将记录按用户分组并写入文件，支持行为权重
+    """
+    user_items = defaultdict(list)
+    user_times = defaultdict(list)
+    user_weights = defaultdict(list)
+    
+    for user_id, post_id, ts, weight in records:
+        uid = user2idx[user_id]
+        pid = post2idx[post_id]
+        user_items[uid].append(pid)
+        user_times[uid].append(ts)
+        user_weights[uid].append(weight)
+    
+    with open(output_path, "w", encoding="utf-8") as f:
+        for uid in sorted(user_items.keys()):
+            items = " ".join(str(item) for item in user_items[uid])
+            times = " ".join(str(t) for t in user_times[uid])
+            weights = " ".join(str(w) for w in user_weights[uid])
+            f.write(f"{uid}\t{items}\t{times}\t{weights}\n")
+
+
+def build_user_post_graph(return_mapping=False):
+    """
+    构建用户-帖子交互图
+    """
+    behavior_rows = fetch_user_post_data()
+    records, user_set, post_set = process_records(behavior_rows)
+    user2idx, post2idx = build_id_maps(user_set, post_set)
+    group_and_write(records, user2idx, post2idx)
+    if return_mapping:
+        return user2idx, post2idx
\ No newline at end of file
commit	d7895173912a5f962183873a9fecdd03b4c4e2ed	[log] [tgz]
author	Raver <22301129@bjtu.edu.cn>	Wed Jun 18 17:54:38 2025 +0800
committer	Raver <22301129@bjtu.edu.cn>	Wed Jun 18 17:54:38 2025 +0800
tree	831bab5ff5b8f0919e401f24f80e86408d967152
parent	afc93da7b6b13985140ef318c32ddd1318cbae16 [diff] [blame]