新增管理员页面和用户申诉、迁移审核页面，以及推荐系统
Change-Id: Ief5646321feb98fadb17da4b4e91caeaacdbacc5
diff --git a/recommend/utils/dataloader.py b/recommend/utils/dataloader.py
new file mode 100644
index 0000000..d519f17
--- /dev/null
+++ b/recommend/utils/dataloader.py
@@ -0,0 +1,92 @@
+from utils.parse_args import args
+from os import path
+from tqdm import tqdm
+import numpy as np
+import scipy.sparse as sp
+import torch
+import networkx as nx
+from copy import deepcopy
+from collections import defaultdict
+import pandas as pd
+
+
class EdgeListData:
    """User-item interaction dataset loaded from tab-separated edge-list files.

    Each training row is ``user<TAB>items<TAB>times`` (or just
    ``user<TAB>items`` when ``has_time`` is false), where ``items`` and
    ``times`` are whitespace-separated integers of equal length. Each test
    row is ``user<TAB>items``; any further columns are ignored.

    Attributes populated after construction:
        edgelist: ``(num_edges, 2)`` int32 array of ``(user, item)`` pairs.
        edge_time: per-edge time step, starting at 1 for the earliest edge.
        num_users, num_items, num_edges: dataset sizes.
        train_user_dict, test_user_dict: ``user -> [item, ...]``.
        user_hist_dict: ``user -> [item, ...]`` with an entry (possibly
            empty) for every user id below ``num_users``.
        test_edge_num: number of interactions in the test file.
        graph: ``scipy.sparse.coo_matrix`` of shape ``(num_users, num_items)``.
        edge_time_dict: only when ``has_time``; ``node -> {neighbour: step}``
            where item nodes are offset by ``num_users``.
    """

    def __init__(self, train_file, test_file, phase='pretrain', pre_dataset=None, has_time=True):
        """Load both files and build the graph and lookup dictionaries.

        Args:
            train_file: path to the training edge-list file.
            test_file: path to the test edge-list file.
            phase: ``'pretrain'`` selects ``args.hour_interval_pre``; any
                other value selects ``args.hour_interval_f``.
            pre_dataset: optional earlier dataset whose ``num_users`` and
                ``num_items`` are reused instead of being inferred.
            has_time: whether training rows carry a timestamp column.
        """
        self.phase = phase
        self.has_time = has_time
        self.pre_dataset = pre_dataset

        self.hour_interval = args.hour_interval_pre if phase == 'pretrain' else args.hour_interval_f

        self.edgelist = []
        self.edge_time = []
        self.num_users = 0
        self.num_items = 0
        self.num_edges = 0

        self.train_user_dict = {}
        self.test_user_dict = {}

        self._load_data(train_file, test_file, has_time)

        if phase == 'pretrain':
            # Deliberately aliased: padding below also lands in train_user_dict.
            self.user_hist_dict = self.train_user_dict
        else:
            # Previously no history was assigned for non-pretrain phases and
            # the padding loop raised AttributeError.
            # NOTE(review): assumes later phases should see the pretraining
            # history plus this split's training items - confirm.
            history = deepcopy(getattr(self.pre_dataset, 'user_hist_dict', {}))
            for user, items in self.train_user_dict.items():
                history.setdefault(user, []).extend(items)
            self.user_hist_dict = history

        # Every user id gets an entry, even without recorded interactions.
        for user in range(self.num_users):
            self.user_hist_dict.setdefault(user, [])

    @staticmethod
    def _id_count(*id_groups):
        """Return ``1 + max id`` over all non-empty groups (0 if all are empty)."""
        highest = -1
        for ids in id_groups:
            if len(ids):
                highest = max(highest, int(np.max(ids)))
        return highest + 1

    def _read_file(self, train_file, test_file, has_time=True):
        """Parse the train and test files into the raw Python containers.

        Appends to ``self.edgelist`` / ``self.edge_time`` and fills
        ``self.train_user_dict``, ``self.test_user_dict`` and
        ``self.test_edge_num``. Blank lines are skipped.

        Raises:
            ValueError: if a row has too few columns, contains a non-integer
                token, or lists a different number of items and timestamps.
        """
        required = 3 if has_time else 2
        with open(train_file, 'r') as f:
            for line_no, raw in enumerate(f, start=1):
                stripped = raw.strip()
                if not stripped:
                    continue
                fields = stripped.split('\t')
                if (len(fields) != 3) if has_time else (len(fields) < 2):
                    raise ValueError(
                        f"{train_file}:{line_no}: expected {required} tab-separated "
                        f"fields, got {len(fields)}"
                    )

                user = int(fields[0])
                item_ids = [int(tok) for tok in fields[1].split()]
                if has_time:
                    stamps = [int(tok) for tok in fields[2].split()]
                else:
                    # No timestamp column: every edge gets timestamp 0.
                    stamps = [0] * len(item_ids)
                if len(stamps) != len(item_ids):
                    # A mismatch would silently misalign edges and times.
                    raise ValueError(
                        f"{train_file}:{line_no}: {len(item_ids)} item(s) but "
                        f"{len(stamps)} timestamp(s)"
                    )

                self.edgelist.extend((user, item) for item in item_ids)
                self.edge_time.extend(stamps)
                # A later row for the same user replaces the earlier one.
                self.train_user_dict[user] = item_ids

        self.test_edge_num = 0
        with open(test_file, 'r') as f:
            for line_no, raw in enumerate(f, start=1):
                stripped = raw.strip()
                if not stripped:
                    continue
                fields = stripped.split('\t')
                if len(fields) < 2:
                    raise ValueError(
                        f"{test_file}:{line_no}: expected at least 2 tab-separated "
                        f"fields, got {len(fields)}"
                    )
                user = int(fields[0])
                item_ids = [int(tok) for tok in fields[1].split()]
                self.test_user_dict[user] = item_ids
                self.test_edge_num += len(item_ids)

    def _load_data(self, train_file, test_file, has_time=True):
        """Read the files, then build the arrays, sizes, graph and time index."""
        self._read_file(train_file, test_file, has_time)

        # reshape keeps the array two-dimensional even with zero edges.
        self.edgelist = np.array(self.edgelist, dtype=np.int32).reshape(-1, 2)

        # Raw timestamps are parsed as int64 so values beyond the int32 range
        # (e.g. millisecond timestamps) neither wrap nor raise.
        raw_times = np.array(self.edge_time, dtype=np.int64)
        edge_time = 1 + self.timestamp_to_time_step(raw_times)
        fits_int32 = edge_time.size == 0 or edge_time.max() <= np.iinfo(np.int32).max
        if np.issubdtype(edge_time.dtype, np.integer) and fits_int32:
            # Time steps are small; keep the int32 dtype this attribute had.
            edge_time = edge_time.astype(np.int32)
        self.edge_time = edge_time

        self.num_edges = len(self.edgelist)
        if self.pre_dataset is not None:
            self.num_users = self.pre_dataset.num_users
            self.num_items = self.pre_dataset.num_items
        else:
            test_users = list(self.test_user_dict)
            test_items = [item for items in self.test_user_dict.values() for item in items]
            self.num_users = self._id_count(self.edgelist[:, 0], test_users)
            self.num_items = self._id_count(self.edgelist[:, 1], test_items)

        self.graph = sp.coo_matrix(
            (np.ones(self.num_edges), (self.edgelist[:, 0], self.edgelist[:, 1])),
            shape=(self.num_users, self.num_items),
        )

        if self.has_time:
            # Symmetric lookup; item nodes are shifted past the user id range.
            self.edge_time_dict = defaultdict(dict)
            for (user, item), step in zip(self.edgelist, self.edge_time):
                item_node = item + self.num_users
                self.edge_time_dict[user][item_node] = step
                self.edge_time_dict[item_node][user] = step

    def timestamp_to_time_step(self, timestamp_arr, least_time=None):
        """Bucket timestamps into steps of ``self.hour_interval * 3600`` units.

        Args:
            timestamp_arr: array-like of timestamps (presumably Unix seconds,
                given the 3600 factor - confirm against the data files).
            least_time: origin subtracted before bucketing; defaults to the
                minimum of ``timestamp_arr``.

        Returns:
            Array of floor-divided offsets; the input is returned unchanged
            when it is empty.
        """
        timestamp_arr = np.asarray(timestamp_arr)
        if timestamp_arr.size == 0:
            # np.min of an empty array raises; there is nothing to bucket.
            return timestamp_arr
        if least_time is None:
            least_time = np.min(timestamp_arr)
        bucket_width = self.hour_interval * 3600
        return (timestamp_arr - least_time) // bucket_width