wht | 4769537 | 2025-06-07 17:23:42 +0800 | [diff] [blame] | 1 | import pymysql |
| 2 | import datetime |
| 3 | from collections import defaultdict |
| 4 | |
# MySQL connection settings; fetch_user_seed_data() passes these straight to
# pymysql.connect() as host / port / database / user / password.
# NOTE(review): the credentials are hard-coded in source — consider loading
# them from the environment or an untracked config file.
SqlURL = "10.126.59.25"  # server host
SqlPort = 3306  # server TCP port
Database = "pt_database_test"  # database (schema) to select
SqlUsername = "root"  # login user
SqlPassword = "123456"  # login password
| 10 | |
| 11 | |
def fetch_user_seed_data():
    """Fetch the raw download and favorite rows from MySQL.

    Connects with the module-level settings (SqlURL, SqlPort, SqlUsername,
    SqlPassword, Database), selects ``user_id, seed_id, download_start`` from
    ``SeedDownload`` and ``user_id, seed_id, created_at`` from
    ``UserFavorite``.

    Returns:
        A ``(download_rows, favorite_rows)`` pair, each exactly what
        ``cursor.fetchall()`` returned for the corresponding query.

    Any exception raised while connecting or querying propagates to the
    caller; the cursor and connection are released first.
    """
    conn = pymysql.connect(
        host=SqlURL,
        port=SqlPort,
        user=SqlUsername,
        password=SqlPassword,
        database=Database,
        charset="utf8mb4",
    )
    # try/finally (rather than trailing close() calls) so that a failing
    # query cannot leak the connection.
    try:
        # The cursor context manager closes the cursor on exit, error or not.
        with conn.cursor() as cursor:
            cursor.execute("SELECT user_id, seed_id, download_start FROM SeedDownload")
            download_rows = cursor.fetchall()
            cursor.execute("SELECT user_id, seed_id, created_at FROM UserFavorite")
            favorite_rows = cursor.fetchall()
    finally:
        conn.close()
    return download_rows, favorite_rows
| 29 | |
| 30 | |
def process_records(download_rows, favorite_rows):
    """Merge download and favorite rows into one list of interaction records.

    Args:
        download_rows: iterable of ``(user_id, seed_id, time)`` triples.
        favorite_rows: iterable of ``(user_id, seed_id, time)`` triples.

    Returns:
        ``(records, user_set, seed_set)`` where ``records`` is a list of
        ``(user_id, seed_id, ts)`` tuples (all downloads first, then all
        favorites, each in input order), and the two sets hold every user id
        and seed id encountered. ``ts`` is the integer Unix timestamp of the
        time value when it is a ``datetime.datetime``, otherwise ``0``.
    """
    merged = []
    seen_users = set()
    seen_seeds = set()
    # Both sources share the same row shape, so one loop body serves both;
    # downloads are handled before favorites to keep the output order.
    for source in (download_rows, favorite_rows):
        for user_id, seed_id, moment in source:
            seen_users.add(user_id)
            seen_seeds.add(seed_id)
            # Anything that is not a datetime (e.g. NULL -> None) maps to 0.
            epoch = int(moment.timestamp()) if isinstance(moment, datetime.datetime) else 0
            merged.append((user_id, seed_id, epoch))
    return merged, seen_users, seen_seeds
| 54 | |
| 55 | |
def build_id_maps(user_set, seed_set):
    """Assign dense zero-based indices to user ids and seed ids.

    Ids are numbered in ascending sorted order, so the mapping is
    deterministic for a given set of ids.

    Args:
        user_set: collection of user ids (must be mutually sortable).
        seed_set: collection of seed ids (must be mutually sortable).

    Returns:
        ``(user2idx, seed2idx)``: two dicts mapping each id to its index.
    """
    ordered_users = sorted(user_set)
    ordered_seeds = sorted(seed_set)
    user2idx = dict(zip(ordered_users, range(len(ordered_users))))
    seed2idx = dict(zip(ordered_seeds, range(len(ordered_seeds))))
    return user2idx, seed2idx
| 60 | |
| 61 | |
def group_and_write(records, user2idx, seed2idx, output_path="./user_seed_graph.txt"):
    """Group records per user and write them as a tab-separated text file.

    Each output line has the form ``<uid>\\t<items>\\t<times>`` where ``uid``
    is the mapped user index, ``items`` the space-separated mapped seed
    indices and ``times`` the space-separated timestamps, both in record
    order. Lines are written in ascending ``uid`` order.

    Args:
        records: iterable of ``(user_id, seed_id, ts)`` tuples.
        user2idx: mapping from user id to index.
        seed2idx: mapping from seed id to index.
        output_path: destination file; overwritten if it exists.

    Raises:
        KeyError: if a record's user or seed id is missing from the maps.
    """
    # uid -> (seed indices, timestamps); the two lists grow in lockstep.
    per_user = {}
    for user_id, seed_id, ts in records:
        uid = user2idx[user_id]
        sid = seed2idx[seed_id]
        seeds, stamps = per_user.setdefault(uid, ([], []))
        seeds.append(sid)
        stamps.append(ts)

    def _format(uid):
        seeds, stamps = per_user[uid]
        items = " ".join(map(str, seeds))
        times = " ".join(map(str, stamps))
        return f"{uid}\t{items}\t{times}\n"

    # The file is opened only after grouping succeeded, so a bad id never
    # truncates an existing output file.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(_format(uid) for uid in sorted(per_user))
| 75 | |
| 76 | |
def build_user_seed_graph(return_mapping=False):
    """Run the full pipeline: fetch rows, index ids, write the graph file.

    Fetches download/favorite rows, converts them to records, builds the
    id-to-index maps and writes the grouped records to the default output
    path of ``group_and_write``.

    Args:
        return_mapping: when truthy, return the id maps that were used.

    Returns:
        ``(user2idx, seed2idx)`` if ``return_mapping`` is truthy, else
        ``None``.
    """
    downloads, favorites = fetch_user_seed_data()
    records, users, seeds = process_records(downloads, favorites)
    user_index, seed_index = build_id_maps(users, seeds)
    group_and_write(records, user_index, seed_index)
    if not return_mapping:
        return None
    return user_index, seed_index