From 0240d278969919f89e8604449668504d6b63a30e Mon Sep 17 00:00:00 2001
From: Jingfei Hou <houjf20@mails.tsinghua.edu.cn>
Date: Tue, 10 Oct 2023 08:57:13 +0000
Subject: [PATCH] Upload New File

---
 train_heavy.ipynb | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 train_heavy.ipynb

diff --git a/train_heavy.ipynb b/train_heavy.ipynb
new file mode 100644
index 0000000..ebcbee3
--- /dev/null
+++ b/train_heavy.ipynb
@@ -0,0 +1 @@

# --- Cell 1: Kaggle environment boilerplate ---

# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# Running this cell (click Run or press Shift+Enter) lists all files under it.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/), which is
# preserved as output when you create a version using "Save & Run All".
# You can also write temporary files to /kaggle/temp/, but they won't be saved
# outside of the current session.

# --- Cell 2: one-class training pipeline ---

import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from scipy import stats
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

np.set_printoptions(threshold=np.inf)

# Amino acids in standard BLOSUM62 row order.
BLOSUM62_ORDER = 'ARNDCQEGHILKMFPSTWYV'


def get_blosum62_index(char):
    # Index of an amino-acid letter in the BLOSUM62 row order. (The original
    # version built the full 20x20 substitution dictionary only to read off
    # its key order; a plain string lookup is equivalent.)
    return BLOSUM62_ORDER.index(char)


def one_hot(data: np.ndarray, is_one_hot=True):
    # Encode a sequence of amino-acid letters either as flattened one-hot rows
    # (is_one_hot=True) or as integer indices, using alphabetical ordering.
    amino_acids = np.array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                            'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'])
    eye = np.eye(20)
    new_data = []
    for ch in data:
        idx = np.where(amino_acids == ch)[0][0]
        new_data.append(eye[idx] if is_one_hot else idx)
    return np.asarray(new_data).reshape(-1)
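# A quick, illustrative sanity check of the two helpers above (not part of the
# original notebook; the values follow directly from the orderings used here):
#
#   >>> get_blosum62_index('A'), get_blosum62_index('V')
#   (0, 19)
#   >>> one_hot(np.array(['A', 'C'])).shape    # two one-hot rows, flattened
#   (40,)
#   >>> one_hot(np.array(['A', 'C']), is_one_hot=False)
#   array([0, 1])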
# Encode an amino-acid sequence with BLOSUM62: each residue is replaced by its
# 20-entry substitution-score row, and the rows are flattened into one list.
def encoding(aa):
    blosum = np.array([
        [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
        [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
        [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
        [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
        [0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
        [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
        [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
        [0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
        [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
        [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
        [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
        [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2],
        [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1],
        [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1],
        [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2],
        [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
        [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0],
        [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3],
        [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1],
        [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4]
    ])
    new_seq = [blosum[get_blosum62_index(ch)] for ch in aa]
    # Flatten the per-residue rows into one list of scores.
    return [score for row in new_seq for score in row]
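# Illustrative check of encoding() (not from the original notebook): each
# residue contributes one 20-score BLOSUM62 row, so a sequence of length n
# flattens to 20*n scores.
#
#   >>> len(encoding('AC'))
#   40
#   >>> encoding('A')[:5]    # first five scores of the alanine row
#   [4, -1, -2, -2, 0]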
# Read the data and produce ref_list and target_list; tar_specie selects the
# target species. data_path holds the one-target data, ref_path the reference
# set. (Open question from the original notes: how to set the ratio between
# reference and target data.)
def get_data_list(data_path, ref_path, tar_specie):
    # Load the target data and keep only the target species.
    X_ref, y_ref = [], []
    X_target, y_target = [], []
    df_data = pd.read_csv(data_path, low_memory=False)
    df = pd.DataFrame(df_data)
    df = df[df['label'].map(lambda x: x == tar_specie)]
    specie = df['label'].tolist()
    # Collect the distinct species names (after filtering, only the target remains).
    specie_set = list(set(specie))
    specie_set.sort()
    # Take the pre-encoded feature columns and convert each row to a tensor.
    df = df.drop(['y', 'label'], axis=1)
    df = df.values.tolist()
    for i in range(len(df)):
        X_target.append(torch.tensor(df[i]).float())
        y_target.append(specie_set.index(tar_specie))
    # Cap the target set at 500 samples.
    m1 = len(y_target)
    if m1 > 500:
        X_target, X_or, y_target, y_or = train_test_split(
            X_target, y_target, test_size=1 - 500 / m1, random_state=42)
    # Load the reference data and drop the target species.
    ref_data = pd.read_csv(ref_path, low_memory=False)
    df_ref = pd.DataFrame(ref_data)
    df_ref = df_ref[df_ref['label'].map(lambda x: x != tar_specie)]
    species_ref = df_ref['label'].tolist()
    # Collect the distinct reference species names.
    specie_ref_set = list(set(species_ref))
    specie_ref_set.sort()
    # Take the pre-encoded feature columns and convert each row to a tensor.
    df_ref = df_ref.drop(['y', 'label'], axis=1)
    df_ref = df_ref.values.tolist()
    for i in range(len(df_ref)):
        X_ref.append(torch.tensor(df_ref[i]).float())
        y_ref.append(specie_ref_set.index(species_ref[i]))
    return X_ref, y_ref, X_target, y_target


# Re-index the reference labels to a contiguous 0..K-1 range.
def label_update(y_ref):
    mapping = {name: value for value, name in enumerate(set(y_ref))}
    return [mapping[y] for y in y_ref]


# Wrap ref_list/target_list as a Dataset: a custom dataset class over lists.
class FRDataset(Dataset):
    def __init__(self, data, labels):
        self.data_ori = data
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


# Build the model and losses for training; training returns the trained model,
# and sampled target-train data is passed through it to obtain the extracted
# features, giving the one-class feature vector.
# Model choice: a Transformer encoder for now. The plan is to pretrain a
# Transformer and initialize directly from the pretrained weights.
class TransEncoder(nn.Module):
    def __init__(self):
        super(TransEncoder, self).__init__()
        # The embedding is computed externally.
        # d_model must be divisible by nhead (20 / 5 = 4).
        # Note: nn.TransformerEncoderLayer defaults to batch_first=False, i.e.
        # inputs are interpreted as (seq, batch, feature); callers here pass
        # (batch, seq, feature).
        encoder_layer = nn.TransformerEncoderLayer(d_model=20, nhead=5)
        self.TransformerEncoder = nn.TransformerEncoder(encoder_layer, num_layers=6)

    def forward(self, x):
        # x has shape (batch_size, seq_len=95, aa_dim=20).
        out = self.TransformerEncoder(x)
        # Flatten 3-D to 2-D: (batch_size, seq_len * aa_dim = 1900).
        out = out.view(out.size(0), -1)
        return out


# Reference model; the two models share the encoder.
class RefEncoder(nn.Module):
    # num_classes is the number of species in the reference set.
    def __init__(self, encoder, num_classes, input_dim=1900):
        super(RefEncoder, self).__init__()
        self.encoder = encoder
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        # x has shape (batch_size, seq_len=95, aa_dim=20); the encoder already
        # flattens its output to (batch_size, 1900).
        out = self.encoder(x)
        # Classification head.
        out = self.fc(out)
        return out


# Target model; the two models share the encoder.
class TarEncoder(nn.Module):
    def __init__(self, encoder):
        super(TarEncoder, self).__init__()
        self.encoder = encoder

    def forward(self, x):
        # x has shape (batch_size, seq_len=95, aa_dim=20); the encoder output
        # is already flattened to (batch_size, 1900).
        return self.encoder(x)


# Cross-entropy on logits: the descriptiveness loss.
class CrossEntropyWithLogit(nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, output: torch.Tensor, labels: torch.Tensor):
        cri = nn.CrossEntropyLoss()
        return cri(output, labels)
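# How the weight sharing works (an illustrative note, not from the original):
# RefEncoder and TarEncoder keep a reference to the *same* TransEncoder module,
# so stepping either optimizer updates the weights both models use.
#
#   shared = TransEncoder()
#   m_ref = RefEncoder(shared, num_classes=3)
#   m_tar = TarEncoder(shared)
#   assert m_ref.encoder is m_tar.encoder   # one module, one set of parameters
#   # An optimizer over m_ref.parameters() therefore also steps the encoder
#   # that m_tar uses, and vice versa.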
# Compactness loss. output has shape (batch_size, 1900), i.e. (batch_size,
# seq_len * aa_dim); `classes` is that flattened feature dimension, and lamda
# weights the compactness loss against the descriptiveness loss.
def compactness_loss(output, batchsize, classes, lamda):
    # e.g. batchsize=25, classes=1900, output.shape=[25, 1900], m.shape=[1900]
    m = output.mean(dim=0)
    lc = 0
    for i in range(output.shape[0]):
        # m1 is the mean of the batch with sample i left out.
        m1 = (batchsize * m - output[i]) / (batchsize - 1)
        lc = lc + torch.matmul(output[i] - m1, output[i] - m1)
    lc = lamda * lc / (batchsize * classes)
    return lc


# Training. Nothing is frozen here; note that the two models share the encoder
# parameters, and updates happen batch by batch within each epoch.
def train(model_ref: nn.Module,
          model_tar: nn.Module,
          ref_loader: DataLoader,
          tar_loader: DataLoader,
          optimizer_ref: torch.optim.Optimizer,
          optimizer_tar: torch.optim.Optimizer,
          description_loss: nn.Module, device, lamda, epoch):
    model_ref.train()
    model_tar.train()
    lc = []
    ld = []
    # Update model_tar first.
    for seq, label in tar_loader:
        seq = torch.reshape(seq, (-1, 95, 20))
        seq = seq.to(device)
        label = label.to(device)
        optimizer_tar.zero_grad()
        output = model_tar(seq)
        loss_lc = compactness_loss(output, output.shape[0], output.shape[1], lamda)
        lc.append(float(loss_lc))
        loss_lc.backward()
        optimizer_tar.step()
    # Because the encoder is shared, model_ref's encoder has already been updated.
    for seq, label in ref_loader:
        seq = torch.reshape(seq, (-1, 95, 20))
        seq = seq.to(device)
        label = label.to(device)
        optimizer_ref.zero_grad()
        output = model_ref(seq)
        loss_ld = description_loss(output, label)
        ld.append(float(loss_ld))
        loss_ld.backward()
        optimizer_ref.step()
    # (A disabled experiment followed here: every other epoch, re-wrap the last
    # lc/ld values as tensors and take a joint optimizer_ref step on their sum.)
    # Likewise, model_tar's encoder has been updated by the loop above.
    lc = sum(lc) / len(lc)
    ld = sum(ld) / len(ld)
    return lc, ld


# Get the feature vector of one class.
def get_class_embedding(model_tar: nn.Module, tar_samples: DataLoader, device):
    model_tar.eval()
    with torch.no_grad():
        tar_embedding = []
        for seq, label in tar_samples:
            seq = torch.reshape(seq, (-1, 95, 20))
            seq = seq.to(device)
            label = label.to(device)
            output = model_tar(seq)
            tar_embedding.append(output.mean(0).cpu().numpy())
        tar_embedding = np.array(tar_embedding)
        # Returned as a numpy array: the mean embedding over all batches.
        tar_embedding = np.mean(tar_embedding, axis=0)
    return tar_embedding


# Test: run target_test (and other data) through one model. Testing requires
# several one-class models, so model_tar is each trained model in turn.
def oneclass_test(tar_test: DataLoader, model_tar: nn.Module, device):
    model_tar.eval()
    with torch.no_grad():
        outputs = []
        for seq, label in tar_test:
            seq = torch.reshape(seq, (-1, 95, 20))
            seq = seq.to(device)
            label = label.to(device)
            output = model_tar(seq)
            outputs.append(output)
    # Similarity against each class's feature vector is computed by the caller.
    return outputs
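# A vectorized equivalent of compactness_loss (an illustrative sketch, not part
# of the original notebook). With B = batchsize, the leave-one-out mean gives
# x_i - m1_i = B * (x_i - m) / (B - 1), so the Python loop over the batch
# reduces to a single tensor expression.
def compactness_loss_vectorized(output, batchsize, classes, lamda):
    # Scaled deviations from the batch mean; their squared sum equals the
    # loop's accumulated value.
    diffs = (batchsize / (batchsize - 1)) * (output - output.mean(dim=0))
    return lamda * (diffs * diffs).sum() / (batchsize * classes)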
# Given a species: split the data into training and reference sets, build and
# train a model for each, and return the target model, the feature vector for
# this one class, and the held-out test data.
def oneclass(datapath, ref_path, specie):
    X_ref, y_ref, X_target, y_target = get_data_list(datapath, ref_path, specie)
    num_class = len(set(y_ref))
    # Re-index the reference labels.
    y_ref = label_update(y_ref)
    ref_dataset = FRDataset(X_ref, y_ref)
    # Split the target data into train and test sets (directly on the lists).
    X_tar_train, X_tar_test, y_tar_train, y_tar_test = train_test_split(
        X_target, y_target, test_size=0.3, random_state=0)
    train_tar = FRDataset(X_tar_train, y_tar_train)
    train_ref_dataloader = DataLoader(ref_dataset, batch_size=128, shuffle=True)
    train_tar_dataloader = DataLoader(train_tar, batch_size=32, shuffle=True)
    # Create the models; the shared encoder is loaded from a pretrained checkpoint.
    encoder_shared = torch.load('/kaggle/input/new-dataset/save_new.pt')
    # encoder_shared = TransEncoder()  # alternative: train from scratch
    model_ref = RefEncoder(encoder_shared, num_classes=num_class, input_dim=1900)
    model_tar = TarEncoder(encoder_shared)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    encoder_shared = encoder_shared.to(device)
    model_tar = model_tar.to(device)
    model_ref = model_ref.to(device)
    description_loss = CrossEntropyWithLogit()
    # Hyperparameters still need tuning; these values are taken from the GitHub
    # reference, where the model had additionally been pretrained.
    optimizer_tar = optim.Adam(model_tar.parameters(), lr=0.5e-3, weight_decay=0.00005)
    optimizer_ref = optim.Adam(model_ref.parameters(), lr=0.5e-3, weight_decay=0.00005)
    for epoch in range(10):
        lc, ld = train(model_ref, model_tar, train_ref_dataloader, train_tar_dataloader,
                       optimizer_ref, optimizer_tar, description_loss, device, 0.1, epoch)
        print(f"epoch:{epoch}, lc:{lc}, ld:{ld}")
    # Sample part of train_tar to generate this class's feature vector
    # (currently the whole set: train_size is 100% of train_tar).
    train_size = int(len(train_tar) * 1)
    test_size = len(train_tar) - train_size
    tar_embedding, tar_other = torch.utils.data.random_split(
        train_tar, [train_size, test_size],
        generator=torch.Generator().manual_seed(0))
    tar_emb_dataset = DataLoader(tar_embedding, batch_size=32, shuffle=True)
    tar_ans = get_class_embedding(model_tar, tar_emb_dataset, device)
    print(tar_ans.shape)
    return model_tar, tar_ans, X_tar_test


# model_list holds each species' model_tar in the same order as test_data and
# its labels; tar_emd_list holds the class feature vectors in that order.
# The test set must be fed to every model in the same order, so here the whole
# test set is simply run through each model in turn.
def all_species_test(model_list, tar_emd_list, test_data, test_label, device):
    y_pred = []
    ans = []
    for i in range(len(model_list)):
        model_tar = model_list[i]
        model_tar.eval()
        with torch.no_grad():
            outputs = []
            for seq in test_data:
                seq = torch.reshape(seq, (-1, 95, 20))
                seq = seq.to(device)
                output = model_tar(seq)
                # Score each sample by Pearson correlation with this class's
                # feature vector: tar_emd_list[i] is a (1900,) numpy vector,
                # output is a (1, 1900) tensor.
                cor = stats.pearsonr(tar_emd_list[i], output[0].cpu().numpy())[0]
                outputs.append(cor)
            ans.append(outputs)
    # Predict, for each sample, the class whose model gives the highest score.
    # (A disabled variant kept the true label whenever its model already scored
    # the sample highest, and fell back to the argmax otherwise.)
    for i in range(len(test_label)):
        y = [score[i] for score in ans]
        y_pred.append(y.index(max(y)))
    print('accuracy:', metrics.accuracy_score(test_label, y_pred))
    C = metrics.confusion_matrix(test_label, y_pred)
    print('Confusion matrix:')
    print(C)
    # Row-normalize the confusion matrix and plot it as a heatmap.
    matrix2 = np.array(C)
    matrix2 = matrix2 / matrix2.sum(axis=1)[:, np.newaxis]
    print(matrix2)
    fig = plt.figure()
    sns_plot = sns.heatmap(matrix2, annot=False, cmap="YlGnBu")
    plt.show()
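# Shape of the scoring above (an illustrative note, not from the original):
# `ans` is a list of per-model score lists, indexed ans[model][sample], so the
# prediction for sample i is the argmax over models of the column ans[:][i].
#
#   >>> ans = [[0.9, 0.1], [0.2, 0.8]]   # 2 models, 2 test samples
#   >>> [max(range(2), key=lambda m: ans[m][i]) for i in range(2)]
#   [0, 1]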
if __name__ == '__main__':
    model_list = []
    tar_emd_list = []
    test_data = []
    test_label = []
    data_path = '/kaggle/input/new-dataset/821.csv'
    ref_path = '/kaggle/input/new-dataset/821.csv'
    # Load all data and train a one-class model for every species.
    df = pd.read_csv(data_path, low_memory=False)
    df = pd.DataFrame(df)
    # Collect the distinct species names.
    specie = df['label'].tolist()
    specie = list(set(specie))
    specie.sort()
    for i in range(len(specie)):
        print(specie[i])
        model_tar, tar_ans, X_tar_test = oneclass(data_path, ref_path, specie[i])
        # '/' is not allowed in file names, so replace it when saving.
        safe_name = specie[i].replace('/', 'or')
        torch.save(model_tar, safe_name + '.model.pt')
        np.save(safe_name + '.features.npy', tar_ans)
        model_list.append(model_tar)
        tar_emd_list.append(tar_ans)
        test_data = test_data + X_tar_test
        test_label = test_label + [i] * len(X_tar_test)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    all_species_test(model_list, tar_emd_list, test_data, test_label, device)
--
GitLab