新增管理员页面和用户申诉、迁移审核页面,推荐系统

Change-Id: Ief5646321feb98fadb17da4b4e91caeaacdbacc5
diff --git a/recommend/hello.py b/recommend/hello.py
deleted file mode 100644
index c6d4e16..0000000
--- a/recommend/hello.py
+++ /dev/null
@@ -1 +0,0 @@
-print("Hello G10!")
diff --git a/recommend/inference.py b/recommend/inference.py
new file mode 100644
index 0000000..697b569
--- /dev/null
+++ b/recommend/inference.py
@@ -0,0 +1,54 @@
+import sys
+sys.path.append('./')
+
+from os import path
+from utils.parse_args import args
+from utils.dataloader import EdgeListData
+from model.LightGCN import LightGCN
+import torch
+import numpy as np
+import time
+
+# 计时:脚本开始
+t_start = time.time()
+
+# 配置参数
+args.data_path = './'
+args.device = 'cuda:7'
+args.pre_model_path = './model/LightGCN_pretrained.pt'
+
+# 1. 加载数据集
+t_data_start = time.time()
+pretrain_data = path.join(args.data_path, "uig.txt")
+pretrain_val_data = path.join(args.data_path, "uig.txt")
+dataset = EdgeListData(pretrain_data, pretrain_val_data)
+t_data_end = time.time()
+
+
+# 2. 加载LightGCN模型
+pretrained_dict = torch.load(args.pre_model_path, map_location=args.device, weights_only=True)
+pretrained_dict['user_embedding'] = pretrained_dict['user_embedding'][:dataset.num_users]
+pretrained_dict['item_embedding'] = pretrained_dict['item_embedding'][:dataset.num_items]
+
+model = LightGCN(dataset, phase='vanilla').to(args.device)
+model.load_state_dict(pretrained_dict, strict=False)
+model.eval()
+
+# 3. 输入用户ID
+user_id = 1
+
+# 4. 推理:获取embedding并打分
+t_infer_start = time.time()
+with torch.no_grad():
+    user_emb, item_emb = model.generate()
+    user_vec = user_emb[user_id].unsqueeze(0)
+    scores = model.rating(user_vec, item_emb).squeeze(0)
+    pred_item = torch.argmax(scores).item()
+t_infer_end = time.time()
+
+t_end = time.time()
+
+print(f"用户{user_id}下一个最可能点击的物品ID为: {pred_item}")
+print(f"加载数据集耗时: {t_data_end - t_data_start:.4f} 秒")
+print(f"推理耗时: {t_infer_end - t_infer_start:.4f} 秒")
+print(f"脚本总耗时: {t_end - t_start:.4f} 秒")
\ No newline at end of file
diff --git a/recommend/model/LightGCN.py b/recommend/model/LightGCN.py
new file mode 100644
index 0000000..b6b447e
--- /dev/null
+++ b/recommend/model/LightGCN.py
@@ -0,0 +1,121 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import scipy.sparse as sp
+import math
+import networkx as nx
+import random
+from copy import deepcopy
+from utils.parse_args import args
+from model.base_model import BaseModel
+from model.operators import EdgelistDrop
+from model.operators import scatter_add, scatter_sum
+
+
+# Shorthand for Xavier/Glorot uniform initialization of embedding tables.
+init = nn.init.xavier_uniform_
+
+class LightGCN(BaseModel):
+    """LightGCN recommender: linear graph convolution over the user-item graph.
+
+    Embeddings are propagated over the bi-normalized bipartite adjacency for
+    `args.num_layers` hops and the per-layer embeddings are summed. The
+    `phase` argument controls how the embedding tables are initialized.
+    """
+    def __init__(self, dataset, pretrained_model=None, phase='pretrain'):
+        super().__init__(dataset)
+        # Symmetrically normalized (users+items) x (users+items) sparse adjacency.
+        self.adj = self._make_binorm_adj(dataset.graph)
+        self.edges = self.adj._indices().t()  # (num_edges, 2) src/dst node pairs
+        self.edge_norm = self.adj._values()   # per-edge normalization weight
+
+        self.phase = phase
+
+        # Identity gate by default; may be replaced externally to transform embeddings.
+        self.emb_gate = lambda x: x
+
+        if self.phase == 'pretrain' or self.phase == 'vanilla' or self.phase == 'for_tune':
+            # Fresh Xavier-initialized user/item embedding tables.
+            self.user_embedding = nn.Parameter(init(torch.empty(self.num_users, self.emb_size)))
+            self.item_embedding = nn.Parameter(init(torch.empty(self.num_items, self.emb_size)))
+
+
+        elif self.phase == 'finetune':
+            # Warm-start from the propagated embeddings of a pretrained model.
+            pre_user_emb, pre_item_emb = pretrained_model.generate()
+            self.user_embedding = nn.Parameter(pre_user_emb).requires_grad_(True)
+            self.item_embedding = nn.Parameter(pre_item_emb).requires_grad_(True)
+
+        elif self.phase == 'continue_tune':
+            # re-initialize for loading state dict
+            self.user_embedding = nn.Parameter(init(torch.empty(self.num_users, self.emb_size)))
+            self.item_embedding = nn.Parameter(init(torch.empty(self.num_items, self.emb_size)))
+
+        self.edge_dropout = EdgelistDrop()
+
+    def _agg(self, all_emb, edges, edge_norm):
+        """One propagation step: weighted scatter-sum of source embeddings into targets."""
+        src_emb = all_emb[edges[:, 0]]
+
+        # bi-norm: scale each message by its edge normalization weight
+        src_emb = src_emb * edge_norm.unsqueeze(1)
+
+        # conv: sum incoming messages per destination node
+        dst_emb = scatter_sum(src_emb, edges[:, 1], dim=0, dim_size=self.num_users+self.num_items)
+        return dst_emb
+    
+    def _edge_binorm(self, edges):
+        """Per-edge 1/sqrt(deg_u * deg_i) normalization (utility; not called within this class)."""
+        user_degs = scatter_add(torch.ones_like(edges[:, 0]), edges[:, 0], dim=0, dim_size=self.num_users)
+        user_degs = user_degs[edges[:, 0]]
+        item_degs = scatter_add(torch.ones_like(edges[:, 1]), edges[:, 1], dim=0, dim_size=self.num_items)
+        item_degs = item_degs[edges[:, 1]]
+        norm = torch.pow(user_degs, -0.5) * torch.pow(item_degs, -0.5)
+        return norm
+
+    def forward(self, edges, edge_norm, return_layers=False):
+        """Propagate embeddings over `edges`.
+
+        Returns (user_emb, item_emb): summed over layers by default, or as
+        per-layer lists when `return_layers` is True.
+        """
+        all_emb = torch.cat([self.user_embedding, self.item_embedding], dim=0)
+        all_emb = self.emb_gate(all_emb)
+        res_emb = [all_emb]
+        for l in range(args.num_layers):
+            all_emb = self._agg(res_emb[-1], edges, edge_norm)
+            res_emb.append(all_emb)
+        if not return_layers:
+            # Sum of all layers (including layer 0) — LightGCN-style combination.
+            res_emb = sum(res_emb)
+            user_res_emb, item_res_emb = res_emb.split([self.num_users, self.num_items], dim=0)
+        else:
+            user_res_emb, item_res_emb = [], []
+            for emb in res_emb:
+                u_emb, i_emb = emb.split([self.num_users, self.num_items], dim=0)
+                user_res_emb.append(u_emb)
+                item_res_emb.append(i_emb)
+        return user_res_emb, item_res_emb
+    
+    def cal_loss(self, batch_data):
+        """BPR + L2 regularization loss for a (users, pos_items, neg_items) batch,
+        with edge dropout applied to the propagation graph."""
+        edges, dropout_mask = self.edge_dropout(self.edges, 1-args.edge_dropout, return_mask=True)
+        edge_norm = self.edge_norm[dropout_mask]
+
+        # forward
+        users, pos_items, neg_items = batch_data
+        user_emb, item_emb = self.forward(edges, edge_norm)
+        batch_user_emb = user_emb[users]
+        pos_item_emb = item_emb[pos_items]
+        neg_item_emb = item_emb[neg_items]
+        rec_loss = self._bpr_loss(batch_user_emb, pos_item_emb, neg_item_emb)
+        reg_loss = args.weight_decay * self._reg_loss(users, pos_items, neg_items)
+
+        loss = rec_loss + reg_loss
+        loss_dict = {
+            "rec_loss": rec_loss.item(),
+            "reg_loss": reg_loss.item(),
+        }
+        return loss, loss_dict
+    
+    @torch.no_grad()
+    def generate(self, return_layers=False):
+        """Propagate over the full (non-dropped) graph for inference."""
+        return self.forward(self.edges, self.edge_norm, return_layers=return_layers)
+    
+    @torch.no_grad()
+    def generate_lgn(self, return_layers=False):
+        """Identical to `generate`; kept as a separate entry point for API compatibility."""
+        return self.forward(self.edges, self.edge_norm, return_layers=return_layers)
+    
+    @torch.no_grad()
+    def rating(self, user_emb, item_emb):
+        """Score every item for every user via a dot product: (U, d) x (I, d)^T -> (U, I)."""
+        return torch.matmul(user_emb, item_emb.t())
+    
+    def _reg_loss(self, users, pos_items, neg_items):
+        """L2 regularization on the layer-0 embeddings of the batch, averaged per user."""
+        u_emb = self.user_embedding[users]
+        pos_i_emb = self.item_embedding[pos_items]
+        neg_i_emb = self.item_embedding[neg_items]
+        reg_loss = (1/2)*(u_emb.norm(2).pow(2) +
+                          pos_i_emb.norm(2).pow(2) +
+                          neg_i_emb.norm(2).pow(2))/float(len(users))
+        return reg_loss
diff --git a/recommend/model/LightGCN_pretrained.pt b/recommend/model/LightGCN_pretrained.pt
new file mode 100644
index 0000000..825e0e2
--- /dev/null
+++ b/recommend/model/LightGCN_pretrained.pt
Binary files differ
diff --git a/recommend/model/base_model.py b/recommend/model/base_model.py
new file mode 100644
index 0000000..819442a
--- /dev/null
+++ b/recommend/model/base_model.py
@@ -0,0 +1,111 @@
+import torch
+import torch.nn as nn
+from utils.parse_args import args
+from scipy.sparse import csr_matrix
+import scipy.sparse as sp
+import numpy as np
+import torch.nn.functional as F
+
+
+class BaseModel(nn.Module):
+    def __init__(self, dataloader):
+        super(BaseModel, self).__init__()
+        self.num_users = dataloader.num_users
+        self.num_items = dataloader.num_items
+        self.emb_size = args.emb_size
+
+    def forward(self):
+        pass
+
+    def cal_loss(self, batch_data):
+        pass
+
+    def _check_inf(self, loss, pos_score, neg_score, edge_weight):
+        # find inf idx
+        inf_idx = torch.isinf(loss) | torch.isnan(loss)
+        if inf_idx.any():
+            print("find inf in loss")
+            if type(edge_weight) != int:
+                print(edge_weight[inf_idx])
+            print(f"pos_score: {pos_score[inf_idx]}")
+            print(f"neg_score: {neg_score[inf_idx]}")
+            raise ValueError("find inf in loss")
+
+    def _make_binorm_adj(self, mat):
+        a = csr_matrix((self.num_users, self.num_users))
+        b = csr_matrix((self.num_items, self.num_items))
+        mat = sp.vstack(
+            [sp.hstack([a, mat]), sp.hstack([mat.transpose(), b])])
+        mat = (mat != 0) * 1.0
+        # mat = (mat + sp.eye(mat.shape[0])) * 1.0# MARK
+        degree = np.array(mat.sum(axis=-1))
+        d_inv_sqrt = np.reshape(np.power(degree, -0.5), [-1])
+        d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
+        d_inv_sqrt_mat = sp.diags(d_inv_sqrt)
+        mat = mat.dot(d_inv_sqrt_mat).transpose().dot(
+            d_inv_sqrt_mat).tocoo()
+
+        # make torch tensor
+        idxs = torch.from_numpy(np.vstack([mat.row, mat.col]).astype(np.int64))
+        vals = torch.from_numpy(mat.data.astype(np.float32))
+        shape = torch.Size(mat.shape)
+        return torch.sparse.FloatTensor(idxs, vals, shape).to(args.device)
+    
+    def _make_binorm_adj_self_loop(self, mat):
+        a = csr_matrix((self.num_users, self.num_users))
+        b = csr_matrix((self.num_items, self.num_items))
+        mat = sp.vstack(
+            [sp.hstack([a, mat]), sp.hstack([mat.transpose(), b])])
+        mat = (mat != 0) * 1.0
+        mat = (mat + sp.eye(mat.shape[0])) * 1.0 # self loop
+        degree = np.array(mat.sum(axis=-1))
+        d_inv_sqrt = np.reshape(np.power(degree, -0.5), [-1])
+        d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
+        d_inv_sqrt_mat = sp.diags(d_inv_sqrt)
+        mat = mat.dot(d_inv_sqrt_mat).transpose().dot(
+            d_inv_sqrt_mat).tocoo()
+
+        # make torch tensor
+        idxs = torch.from_numpy(np.vstack([mat.row, mat.col]).astype(np.int64))
+        vals = torch.from_numpy(mat.data.astype(np.float32))
+        shape = torch.Size(mat.shape)
+        return torch.sparse.FloatTensor(idxs, vals, shape).to(args.device)
+
+
+    def _sp_matrix_to_sp_tensor(self, sp_matrix):
+        coo = sp_matrix.tocoo()
+        indices = torch.LongTensor([coo.row, coo.col])
+        values = torch.FloatTensor(coo.data)
+        return torch.sparse.FloatTensor(indices, values, coo.shape).coalesce().to(args.device)
+
+    def _bpr_loss(self, user_emb, pos_item_emb, neg_item_emb):
+        pos_score = (user_emb * pos_item_emb).sum(dim=1)
+        neg_score = (user_emb * neg_item_emb).sum(dim=1)
+        loss = -torch.log(1e-10 + torch.sigmoid((pos_score - neg_score)))
+        self._check_inf(loss, pos_score, neg_score, 0)
+        return loss.mean()
+    
+    def _nce_loss(self, pos_score, neg_score, edge_weight=1):
+        numerator = torch.exp(pos_score)
+        denominator = torch.exp(pos_score) + torch.exp(neg_score).sum(dim=1)
+        loss = -torch.log(numerator/denominator) * edge_weight
+        self._check_inf(loss, pos_score, neg_score, edge_weight)
+        return loss.mean()
+    
+    def _infonce_loss(self, pos_1, pos_2, negs, tau):
+        pos_1 = self.cl_mlp(pos_1)
+        pos_2 = self.cl_mlp(pos_2)
+        negs = self.cl_mlp(negs)
+        pos_1 = F.normalize(pos_1, dim=-1)
+        pos_2 = F.normalize(pos_2, dim=-1)
+        negs = F.normalize(negs, dim=-1)
+        pos_score = torch.mul(pos_1, pos_2).sum(dim=1)
+        # B, 1, E * B, E, N -> B, N
+        neg_score = torch.bmm(pos_1.unsqueeze(1), negs.transpose(1, 2)).squeeze(1)
+        # infonce loss
+        numerator = torch.exp(pos_score / tau)
+        denominator = torch.exp(pos_score / tau) + torch.exp(neg_score / tau).sum(dim=1)
+        loss = -torch.log(numerator/denominator)
+        self._check_inf(loss, pos_score, neg_score, 0)
+        return loss.mean()
+    
\ No newline at end of file
diff --git a/recommend/model/operators.py b/recommend/model/operators.py
new file mode 100644
index 0000000..a508966
--- /dev/null
+++ b/recommend/model/operators.py
@@ -0,0 +1,52 @@
+import torch
+from typing import Optional, Tuple
+from torch import nn
+
+def broadcast(src: torch.Tensor, other: torch.Tensor, dim: int):
+    if dim < 0:
+        dim = other.dim() + dim
+    if src.dim() == 1:
+        for _ in range(0, dim):
+            src = src.unsqueeze(0)
+    for _ in range(src.dim(), other.dim()):
+        src = src.unsqueeze(-1)
+    src = src.expand(other.size())
+    return src
+
+def scatter_sum(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
+                out: Optional[torch.Tensor] = None,
+                dim_size: Optional[int] = None) -> torch.Tensor:
+    index = broadcast(index, src, dim)
+    if out is None:
+        size = list(src.size())
+        if dim_size is not None:
+            size[dim] = dim_size
+        elif index.numel() == 0:
+            size[dim] = 0
+        else:
+            size[dim] = int(index.max()) + 1
+        out = torch.zeros(size, dtype=src.dtype, device=src.device)
+        return out.scatter_add_(dim, index, src)
+    else:
+        return out.scatter_add_(dim, index, src)
+
+def scatter_add(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
+                out: Optional[torch.Tensor] = None,
+                dim_size: Optional[int] = None) -> torch.Tensor:
+    return scatter_sum(src, index, dim, out, dim_size)
+
+
+class EdgelistDrop(nn.Module):
+    def __init__(self):
+        super(EdgelistDrop, self).__init__()
+
+    def forward(self, edgeList, keep_rate, return_mask=False):
+        if keep_rate == 1.0:
+            return edgeList, torch.ones(edgeList.size(0)).type(torch.bool)
+        edgeNum = edgeList.size(0)
+        mask = (torch.rand(edgeNum) + keep_rate).floor().type(torch.bool)
+        newEdgeList = edgeList[mask, :]
+        if return_mask:
+            return newEdgeList, mask
+        else:
+            return newEdgeList
diff --git a/recommend/uig.txt b/recommend/uig.txt
new file mode 100644
index 0000000..5846057
--- /dev/null
+++ b/recommend/uig.txt
@@ -0,0 +1,2 @@
+0	1 3 9 12 5 7 6 8 4	1511683379 1511683385 1511683431 1511683453 1511683481 1511692992 1511693011 1511693077 1511787191

+1	10 11 2	1511578239 1511594732 1511664627
\ No newline at end of file
diff --git a/recommend/utils/dataloader.py b/recommend/utils/dataloader.py
new file mode 100644
index 0000000..d519f17
--- /dev/null
+++ b/recommend/utils/dataloader.py
@@ -0,0 +1,92 @@
+from utils.parse_args import args
+from os import path
+from tqdm import tqdm
+import numpy as np
+import scipy.sparse as sp
+import torch
+import networkx as nx
+from copy import deepcopy
+from collections import defaultdict
+import pandas as pd
+
+
+class EdgeListData:
+    def __init__(self, train_file, test_file, phase='pretrain', pre_dataset=None, has_time=True):
+        self.phase = phase
+        self.has_time = has_time
+        self.pre_dataset = pre_dataset
+
+        self.hour_interval = args.hour_interval_pre if phase == 'pretrain' else args.hour_interval_f
+
+        self.edgelist = []
+        self.edge_time = []
+        self.num_users = 0
+        self.num_items = 0
+        self.num_edges = 0
+
+        self.train_user_dict = {}
+        self.test_user_dict = {}
+
+        self._load_data(train_file, test_file, has_time)
+
+        if phase == 'pretrain':
+            self.user_hist_dict = self.train_user_dict
+        
+        users_has_hist = set(list(self.user_hist_dict.keys()))
+        all_users = set(list(range(self.num_users)))
+        users_no_hist = all_users - users_has_hist
+        for u in users_no_hist:
+            self.user_hist_dict[u] = []
+
+    def _read_file(self, train_file, test_file, has_time=True):
+        with open(train_file, 'r') as f:
+            for line in f:
+                line = line.strip().split('\t')
+                if not has_time:
+                    user, items = line[:2]
+                    times = " ".join(["0"] * len(items.split(" ")))
+                else:
+                    user, items, times = line
+                    
+                for i in items.split(" "):
+                    self.edgelist.append((int(user), int(i)))
+                for i in times.split(" "):
+                    self.edge_time.append(int(i))
+                self.train_user_dict[int(user)] = [int(i) for i in items.split(" ")]
+
+        self.test_edge_num = 0
+        with open(test_file, 'r') as f:
+            for line in f:
+                line = line.strip().split('\t')
+                user, items = line[:2]
+                self.test_user_dict[int(user)] = [int(i) for i in items.split(" ")]
+                self.test_edge_num += len(self.test_user_dict[int(user)])
+
+    def _load_data(self, train_file, test_file, has_time=True):
+        self._read_file(train_file, test_file, has_time)
+
+        self.edgelist = np.array(self.edgelist, dtype=np.int32)
+        self.edge_time = 1 + self.timestamp_to_time_step(np.array(self.edge_time, dtype=np.int32))
+        self.num_edges = len(self.edgelist)
+        if self.pre_dataset is not None:
+            self.num_users = self.pre_dataset.num_users
+            self.num_items = self.pre_dataset.num_items
+        else:
+            self.num_users = max([np.max(self.edgelist[:, 0]) + 1, np.max(list(self.test_user_dict.keys())) + 1])
+            self.num_items = max([np.max(self.edgelist[:, 1]) + 1, np.max([np.max(self.test_user_dict[u]) for u in self.test_user_dict.keys()]) + 1])
+
+        self.graph = sp.coo_matrix((np.ones(self.num_edges), (self.edgelist[:, 0], self.edgelist[:, 1])), shape=(self.num_users, self.num_items))
+
+        if self.has_time:
+            self.edge_time_dict = defaultdict(dict)
+            for i in range(len(self.edgelist)):
+                self.edge_time_dict[self.edgelist[i][0]][self.edgelist[i][1]+self.num_users] = self.edge_time[i]
+                self.edge_time_dict[self.edgelist[i][1]+self.num_users][self.edgelist[i][0]] = self.edge_time[i]
+
+    def timestamp_to_time_step(self, timestamp_arr, least_time=None):
+        interval_hour = self.hour_interval
+        if least_time is None:
+            least_time = np.min(timestamp_arr)
+        timestamp_arr = timestamp_arr - least_time
+        timestamp_arr = timestamp_arr // (interval_hour * 3600)
+        return timestamp_arr
diff --git a/recommend/utils/parse_args.py b/recommend/utils/parse_args.py
new file mode 100644
index 0000000..3e86a47
--- /dev/null
+++ b/recommend/utils/parse_args.py
@@ -0,0 +1,57 @@
+import argparse
+
+def parse_args():
+    """Build and return the ArgumentParser holding all GraphPro hyperparameters."""
+    parser = argparse.ArgumentParser(description='GraphPro')
+    # Experiment bookkeeping
+    parser.add_argument('--phase', type=str, default='pretrain')
+    parser.add_argument('--plugin', action='store_true', default=False)
+    parser.add_argument('--save_path', type=str, default="saved" ,help='where to save model and logs')
+    parser.add_argument('--data_path', type=str, default="dataset/yelp",help='where to load data')
+    parser.add_argument('--exp_name', type=str, default='1')
+    parser.add_argument('--desc', type=str, default='')
+    parser.add_argument('--ab', type=str, default='full')
+    parser.add_argument('--log', type=int, default=1)
+
+    # Model / device selection
+    parser.add_argument('--device', type=str, default="cuda")
+    parser.add_argument('--model', type=str, default='GraphPro')
+    parser.add_argument('--pre_model', type=str, default='GraphPro')
+    parser.add_argument('--f_model', type=str, default='GraphPro')
+    parser.add_argument('--pre_model_path', type=str, default='pretrained_model.pt')
+
+    # Temporal discretization (hours per time step) for pretrain / finetune
+    parser.add_argument('--hour_interval_pre', type=float, default=1)
+    parser.add_argument('--hour_interval_f', type=int, default=1)
+    parser.add_argument('--emb_dropout', type=float, default=0)
+
+    parser.add_argument('--updt_inter', type=int, default=1)
+    parser.add_argument('--samp_decay', type=float, default=0.05)
+    
+    # Training hyperparameters
+    parser.add_argument('--edge_dropout', type=float, default=0.5)
+    parser.add_argument('--emb_size', type=int, default=64)
+    parser.add_argument('--batch_size', type=int, default=2048)
+    parser.add_argument('--eval_batch_size', type=int, default=512)
+    parser.add_argument('--seed', type=int, default=2023)
+    parser.add_argument('--num_epochs', type=int, default=300)
+    parser.add_argument('--neighbor_sample_num', type=int, default=5)
+    parser.add_argument('--lr', type=float, default=0.001)
+    parser.add_argument('--weight_decay', type=float, default=1e-4)
+    parser.add_argument('--metrics', type=str, default='recall;ndcg')
+    parser.add_argument('--metrics_k', type=str, default='20')
+    parser.add_argument('--early_stop_patience', type=int, default=10)
+    parser.add_argument('--neg_num', type=int, default=1)
+
+    parser.add_argument('--num_layers', type=int, default=3)
+
+
+    return parser
+
+parser = parse_args()
+args = parser.parse_known_args()[0]
+if args.pre_model == args.f_model:
+    args.model = args.pre_model
+elif args.pre_model != 'LightGCN':
+    args.model = args.pre_model
+
+args = parser.parse_args()
+if args.pre_model == args.f_model:
+    args.model = args.pre_model
+elif args.pre_model != 'LightGCN':
+    args.model = args.pre_model
\ No newline at end of file