Raver | d789517 | 2025-06-18 17:54:38 +0800 | [diff] [blame^] | 1 | import pymysql |
| 2 | import datetime |
| 3 | from collections import defaultdict |
| 4 | |
| 5 | SqlURL = "10.126.59.25" |
| 6 | SqlPort = 3306 |
| 7 | Database = "redbook" # 修改为redbook数据库 |
| 8 | SqlUsername = "root" |
| 9 | SqlPassword = "123456" |
| 10 | |
| 11 | |
| 12 | def fetch_user_post_data(): |
| 13 | """ |
| 14 | 从redbook数据库的behaviors表获取用户-帖子交互数据,只包含已发布的帖子 |
| 15 | """ |
| 16 | conn = pymysql.connect( |
| 17 | host=SqlURL, |
| 18 | port=SqlPort, |
| 19 | user=SqlUsername, |
| 20 | password=SqlPassword, |
| 21 | database=Database, |
| 22 | charset="utf8mb4" |
| 23 | ) |
| 24 | cursor = conn.cursor() |
| 25 | # 获取用户行为数据,只包含已发布帖子的行为数据 |
| 26 | cursor.execute(""" |
| 27 | SELECT b.user_id, b.post_id, b.type, b.value, b.created_at |
| 28 | FROM behaviors b |
| 29 | INNER JOIN posts p ON b.post_id = p.id |
| 30 | WHERE b.type IN ('like', 'favorite', 'comment', 'view', 'share') |
| 31 | AND p.status = 'published' |
| 32 | ORDER BY b.created_at |
| 33 | """) |
| 34 | behavior_rows = cursor.fetchall() |
| 35 | cursor.close() |
| 36 | conn.close() |
| 37 | return behavior_rows |
| 38 | |
| 39 | |
| 40 | def process_records(behavior_rows): |
| 41 | """ |
| 42 | 处理用户行为记录,为不同类型的行为分配权重 |
| 43 | """ |
| 44 | records = [] |
| 45 | user_set = set() |
| 46 | post_set = set() |
| 47 | |
| 48 | # 为不同行为类型分配权重 |
| 49 | behavior_weights = { |
| 50 | 'view': 1, |
| 51 | 'like': 2, |
| 52 | 'comment': 3, |
| 53 | 'share': 4, |
| 54 | 'favorite': 5 |
| 55 | } |
| 56 | |
| 57 | for row in behavior_rows: |
| 58 | user_id, post_id, behavior_type, value, created_at = row |
| 59 | user_set.add(user_id) |
| 60 | post_set.add(post_id) |
| 61 | |
| 62 | if isinstance(created_at, datetime.datetime): |
| 63 | ts = int(created_at.timestamp()) |
| 64 | else: |
| 65 | ts = 0 |
| 66 | |
| 67 | # 使用行为权重 |
| 68 | weight = behavior_weights.get(behavior_type, 1) * (value or 1) |
| 69 | records.append((user_id, post_id, ts, weight)) |
| 70 | |
| 71 | return records, user_set, post_set |
| 72 | |
| 73 | |
| 74 | def build_id_maps(user_set, post_set): |
| 75 | """ |
| 76 | 构建用户和帖子的ID映射 |
| 77 | """ |
| 78 | user2idx = {uid: idx for idx, uid in enumerate(sorted(user_set))} |
| 79 | post2idx = {pid: idx for idx, pid in enumerate(sorted(post_set))} |
| 80 | return user2idx, post2idx |
| 81 | |
| 82 | |
| 83 | def group_and_write(records, user2idx, post2idx, output_path="./app/user_post_graph.txt"): |
| 84 | """ |
| 85 | 将记录按用户分组并写入文件,支持行为权重 |
| 86 | """ |
| 87 | user_items = defaultdict(list) |
| 88 | user_times = defaultdict(list) |
| 89 | user_weights = defaultdict(list) |
| 90 | |
| 91 | for user_id, post_id, ts, weight in records: |
| 92 | uid = user2idx[user_id] |
| 93 | pid = post2idx[post_id] |
| 94 | user_items[uid].append(pid) |
| 95 | user_times[uid].append(ts) |
| 96 | user_weights[uid].append(weight) |
| 97 | |
| 98 | with open(output_path, "w", encoding="utf-8") as f: |
| 99 | for uid in sorted(user_items.keys()): |
| 100 | items = " ".join(str(item) for item in user_items[uid]) |
| 101 | times = " ".join(str(t) for t in user_times[uid]) |
| 102 | weights = " ".join(str(w) for w in user_weights[uid]) |
| 103 | f.write(f"{uid}\t{items}\t{times}\t{weights}\n") |
| 104 | |
| 105 | |
| 106 | def build_user_post_graph(return_mapping=False): |
| 107 | """ |
| 108 | 构建用户-帖子交互图 |
| 109 | """ |
| 110 | behavior_rows = fetch_user_post_data() |
| 111 | records, user_set, post_set = process_records(behavior_rows) |
| 112 | user2idx, post2idx = build_id_maps(user_set, post_set) |
| 113 | group_and_write(records, user2idx, post2idx) |
| 114 | if return_mapping: |
| 115 | return user2idx, post2idx |