blob: a453e4e90fb40b4318c2f56f2d137ee2e5e8e866 [file] [log] [blame]
Raverd7895172025-06-18 17:54:38 +08001import pymysql
2import datetime
3from collections import defaultdict
4
5SqlURL = "10.126.59.25"
6SqlPort = 3306
7Database = "redbook" # 修改为redbook数据库
8SqlUsername = "root"
9SqlPassword = "123456"
10
11
12def fetch_user_post_data():
13 """
14 从redbook数据库的behaviors表获取用户-帖子交互数据,只包含已发布的帖子
15 """
16 conn = pymysql.connect(
17 host=SqlURL,
18 port=SqlPort,
19 user=SqlUsername,
20 password=SqlPassword,
21 database=Database,
22 charset="utf8mb4"
23 )
24 cursor = conn.cursor()
25 # 获取用户行为数据,只包含已发布帖子的行为数据
26 cursor.execute("""
27 SELECT b.user_id, b.post_id, b.type, b.value, b.created_at
28 FROM behaviors b
29 INNER JOIN posts p ON b.post_id = p.id
30 WHERE b.type IN ('like', 'favorite', 'comment', 'view', 'share')
31 AND p.status = 'published'
32 ORDER BY b.created_at
33 """)
34 behavior_rows = cursor.fetchall()
35 cursor.close()
36 conn.close()
37 return behavior_rows
38
39
40def process_records(behavior_rows):
41 """
42 处理用户行为记录,为不同类型的行为分配权重
43 """
44 records = []
45 user_set = set()
46 post_set = set()
47
48 # 为不同行为类型分配权重
49 behavior_weights = {
50 'view': 1,
51 'like': 2,
52 'comment': 3,
53 'share': 4,
54 'favorite': 5
55 }
56
57 for row in behavior_rows:
58 user_id, post_id, behavior_type, value, created_at = row
59 user_set.add(user_id)
60 post_set.add(post_id)
61
62 if isinstance(created_at, datetime.datetime):
63 ts = int(created_at.timestamp())
64 else:
65 ts = 0
66
67 # 使用行为权重
68 weight = behavior_weights.get(behavior_type, 1) * (value or 1)
69 records.append((user_id, post_id, ts, weight))
70
71 return records, user_set, post_set
72
73
74def build_id_maps(user_set, post_set):
75 """
76 构建用户和帖子的ID映射
77 """
78 user2idx = {uid: idx for idx, uid in enumerate(sorted(user_set))}
79 post2idx = {pid: idx for idx, pid in enumerate(sorted(post_set))}
80 return user2idx, post2idx
81
82
83def group_and_write(records, user2idx, post2idx, output_path="./app/user_post_graph.txt"):
84 """
85 将记录按用户分组并写入文件,支持行为权重
86 """
87 user_items = defaultdict(list)
88 user_times = defaultdict(list)
89 user_weights = defaultdict(list)
90
91 for user_id, post_id, ts, weight in records:
92 uid = user2idx[user_id]
93 pid = post2idx[post_id]
94 user_items[uid].append(pid)
95 user_times[uid].append(ts)
96 user_weights[uid].append(weight)
97
98 with open(output_path, "w", encoding="utf-8") as f:
99 for uid in sorted(user_items.keys()):
100 items = " ".join(str(item) for item in user_items[uid])
101 times = " ".join(str(t) for t in user_times[uid])
102 weights = " ".join(str(w) for w in user_weights[uid])
103 f.write(f"{uid}\t{items}\t{times}\t{weights}\n")
104
105
106def build_user_post_graph(return_mapping=False):
107 """
108 构建用户-帖子交互图
109 """
110 behavior_rows = fetch_user_post_data()
111 records, user_set, post_set = process_records(behavior_rows)
112 user2idx, post2idx = build_id_maps(user_set, post_set)
113 group_and_write(records, user2idx, post2idx)
114 if return_mapping:
115 return user2idx, post2idx