# blob: a453e4e90fb40b4318c2f56f2d137ee2e5e8e866 [file] [log] [blame]
import pymysql
import datetime
from collections import defaultdict
# Connection settings passed to pymysql.connect() in fetch_user_post_data().
# NOTE(review): the host and credentials are hard-coded in source; consider
# loading them from environment variables or a secrets store instead.
SqlURL = "10.126.59.25"
SqlPort = 3306
Database = "redbook" # Changed to the redbook database
SqlUsername = "root"
SqlPassword = "123456"
def fetch_user_post_data():
    """Fetch user-post interaction rows from the ``behaviors`` table.

    Only behaviors of type like/favorite/comment/view/share that belong to
    posts whose status is ``'published'`` are selected.

    Returns:
        The result of ``cursor.fetchall()``: rows of
        ``(user_id, post_id, type, value, created_at)`` ordered by
        ``created_at``.

    Raises:
        Any exception raised by pymysql while connecting or querying; the
        cursor and connection are closed before it propagates.
    """
    conn = pymysql.connect(
        host=SqlURL,
        port=SqlPort,
        user=SqlUsername,
        password=SqlPassword,
        database=Database,
        charset="utf8mb4",
    )
    try:
        # The cursor context manager closes the cursor even when the query
        # fails; the finally clause does the same for the connection.
        with conn.cursor() as cursor:
            # Behavior rows, restricted to published posts.
            cursor.execute("""
                SELECT b.user_id, b.post_id, b.type, b.value, b.created_at
                FROM behaviors b
                INNER JOIN posts p ON b.post_id = p.id
                WHERE b.type IN ('like', 'favorite', 'comment', 'view', 'share')
                AND p.status = 'published'
                ORDER BY b.created_at
            """)
            return cursor.fetchall()
    finally:
        conn.close()
def process_records(behavior_rows):
    """Convert raw behavior rows into weighted interaction records.

    Args:
        behavior_rows: iterable of 5-tuples
            ``(user_id, post_id, behavior_type, value, created_at)``.

    Returns:
        A tuple ``(records, user_set, post_set)`` where ``records`` is a list
        of ``(user_id, post_id, timestamp, weight)`` in input order and the
        two sets hold every user id and post id seen.
    """
    # Base weight for each behavior type; unknown types fall back to 1.
    type_weight = {
        'view': 1,
        'like': 2,
        'comment': 3,
        'share': 4,
        'favorite': 5,
    }
    interactions = []
    seen_users, seen_posts = set(), set()

    for uid, pid, kind, amount, when in behavior_rows:
        seen_users.add(uid)
        seen_posts.add(pid)
        # Non-datetime values (e.g. None or strings) map to timestamp 0.
        epoch = int(when.timestamp()) if isinstance(when, datetime.datetime) else 0
        # A falsy value (None, 0) counts as a multiplier of 1.
        multiplier = amount or 1
        interactions.append((uid, pid, epoch, type_weight.get(kind, 1) * multiplier))

    return interactions, seen_users, seen_posts
def build_id_maps(user_set, post_set):
    """Map user ids and post ids to dense zero-based indices.

    Ids are sorted first, so the smallest id receives index 0.

    Returns:
        A tuple ``(user2idx, post2idx)`` of dicts from id to index.
    """
    def _dense_index(ids):
        ordered = sorted(ids)
        return dict(zip(ordered, range(len(ordered))))

    return _dense_index(user_set), _dense_index(post_set)
def group_and_write(records, user2idx, post2idx, output_path="./app/user_post_graph.txt"):
    """Group records by user index and write one line per user.

    Each output line is
    ``<user_idx>\\t<post indices>\\t<timestamps>\\t<weights>`` where the three
    lists are space-separated and keep the order of ``records``. Lines are
    written in ascending user-index order.

    Args:
        records: iterable of ``(user_id, post_id, timestamp, weight)``.
        user2idx: dict from user id to index.
        post2idx: dict from post id to index.
        output_path: file to (over)write, encoded as UTF-8.
    """
    # One entry per user index holding three parallel columns:
    # (post indices, timestamps, weights).
    per_user = defaultdict(lambda: ([], [], []))
    for raw_user, raw_post, stamp, score in records:
        user_idx = user2idx[raw_user]
        post_idx = post2idx[raw_post]
        post_col, time_col, weight_col = per_user[user_idx]
        post_col.append(post_idx)
        time_col.append(stamp)
        weight_col.append(score)

    def _joined(values):
        return " ".join(map(str, values))

    with open(output_path, "w", encoding="utf-8") as out:
        for user_idx in sorted(per_user):
            post_col, time_col, weight_col = per_user[user_idx]
            out.write(
                f"{user_idx}\t{_joined(post_col)}\t{_joined(time_col)}\t{_joined(weight_col)}\n"
            )
def build_user_post_graph(return_mapping=False):
    """Build the user-post interaction graph file from the database.

    Fetches the behavior rows, converts them to weighted records, builds the
    id-to-index maps and writes the grouped records to the default output
    path of ``group_and_write``.

    Args:
        return_mapping: when true, return ``(user2idx, post2idx)``;
            otherwise return ``None``.
    """
    records, user_set, post_set = process_records(fetch_user_post_data())
    user_index, post_index = build_id_maps(user_set, post_set)
    group_and_write(records, user_index, post_index)
    if not return_mapping:
        return None
    return user_index, post_index