From 0240d278969919f89e8604449668504d6b63a30e Mon Sep 17 00:00:00 2001
From: Jingfei Hou <houjf20@mails.tsinghua.edu.cn>
Date: Tue, 10 Oct 2023 08:57:13 +0000
Subject: [PATCH] Upload New File

---
 train_heavy.ipynb | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 train_heavy.ipynb

diff --git a/train_heavy.ipynb b/train_heavy.ipynb
new file mode 100644
index 0000000..ebcbee3
--- /dev/null
+++ b/train_heavy.ipynb
@@ -0,0 +1 @@
+{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2023-08-21T09:50:39.913747Z","iopub.execute_input":"2023-08-21T09:50:39.914413Z","iopub.status.idle":"2023-08-21T09:50:39.937846Z","shell.execute_reply.started":"2023-08-21T09:50:39.914377Z","shell.execute_reply":"2023-08-21T09:50:39.935978Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import os\nos.environ['KMP_DUPLICATE_LIB_OK']='True'\nimport pandas as pd\nimport torch\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nimport torch.nn as nn\nfrom torch import optim\nfrom torch.utils.data import Dataset\nfrom torch.utils.data import DataLoader\nfrom scipy import stats\nfrom sklearn import metrics\nnp.set_printoptions(threshold=np.inf)\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\ndef get_blosum62_index(char):\n    blosum62 = {\n    'A': [ 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -3, -1, 1, 0, -3, -2, 0],\n    'R': [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],\n    'N': [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],\n    'D': [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -4, -1, 0, -1, -4, -3, -3],\n    'C': [ 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],\n    'Q': [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],\n    'E': [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],\n    'G': [ 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -4, -2, 0, -2, -4, -3, -2],\n    'H': [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],\n    'I': [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],\n    'L': [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],\n    'K': [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2],\n    'M': [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1],\n    'F': [-3, -3, -3, -4, -2, -3, -3, -4, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1],\n    'P': [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, 
\n\n\n# Read the data and build the reference and target lists; the target species is passed as an argument\n# data_path holds the one-target data, ref_path the reference-set data\n# P.S. Still to be decided: how to set the ratio between reference and target data\ndef get_data_list(data_path, ref_path, tar_specie):\n    # load the dataset and keep only the target species\n    X_ref, y_ref = [], []\n    X_target, y_target = [], []\n    df_data = pd.read_csv(data_path, low_memory=False)\n    df = pd.DataFrame(df_data)\n    df = df[df['label'].map(lambda x: x == tar_specie)]\n    specie = df['label'].tolist()\n    # collect the distinct species names\n    specie_set = list(set(specie))\n    specie_set.sort()\n    # extract the sequences and convert them to tensors\n    df = df.drop(['y', 'label'], axis=1)\n    df = df.values.tolist()\n    for i in range(len(df)):\n        X_target.append(torch.tensor(df[i]).float())\n        y_target.append(specie_set.index(tar_specie))\n    m1 = len(y_target)\n    # cap the target set at 500 samples\n    if m1 > 500:\n        X_target, X_or, y_target, y_or = train_test_split(X_target, y_target, test_size=1-500/m1, random_state=42)\n    # load the reference set and drop the target species\n    ref_data = pd.read_csv(ref_path, low_memory=False)\n    df_ref = pd.DataFrame(ref_data)\n    df_ref = df_ref[df_ref['label'].map(lambda x: x != tar_specie)]\n    species_ref = df_ref['label'].tolist()\n    # collect the distinct species names\n    specie_ref_set = list(set(species_ref))\n    specie_ref_set.sort()\n    # extract the sequences and convert them to tensors\n    df_ref = df_ref.drop(['y', 'label'], axis=1)\n    df_ref = df_ref.values.tolist()\n    for i in range(len(df_ref)):\n        X_ref.append(torch.tensor(df_ref[i]).float())\n        y_ref.append(specie_ref_set.index(species_ref[i]))\n    return X_ref, y_ref, X_target, y_target\n\n\n# Re-index the reference-set labels to consecutive integers 0..K-1\ndef label_update(y_ref):\n    mapping = {name: idx for idx, name in enumerate(set(y_ref))}\n    return [mapping[y] for y in y_ref]
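\n\n\n# Editor's sketch (illustrative only): label_update maps arbitrary label ids\n# onto 0..K-1, e.g. three distinct ids end up as the classes {0, 1, 2}\nassert sorted(set(label_update([7, 3, 7, 9]))) == [0, 1, 2]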
\n\n\n# Wrap the ref/target lists in a Dataset\n# Custom dataset class that takes plain Python lists\nclass FRDataset(Dataset):\n    def __init__(self, data, labels):\n        self.data_ori = data\n        self.data = data\n        self.labels = labels\n\n    def __len__(self):\n        return len(self.data)\n\n    def __getitem__(self, idx):\n        sample = self.data[idx]\n        label = self.labels[idx]\n        return sample, label\n\n\n# Build the models and loss functions for training: train, then either return the trained model or sample part of target-train to extract features and obtain the one-class feature vector\n# Model choice: for now, a Transformer encoder\n# Eventually a Transformer should be pretrained and its weights loaded here for initialization\nclass TransEncoder(nn.Module):\n    def __init__(self):\n        super(TransEncoder, self).__init__()\n        # the embedding is computed outside this module\n        # d_model must be divisible by nhead\n        # NB: without batch_first=True this layer treats dim 0 as the sequence,\n        # while the inputs below are reshaped to (batch, 95, 20)\n        encoder_layer = nn.TransformerEncoderLayer(d_model=20, nhead=5)\n        self.TransformerEncoder = nn.TransformerEncoder(encoder_layer, num_layers=6)\n\n    def forward(self, x):\n        # x has shape (batch_size, seq_len=95, aa_dim=20)\n        out = self.TransformerEncoder(x)\n        # flatten 3-D to 2-D: (batch_size, seq_len*aa_dim=1900)\n        out = out.view(out.size(0), -1)\n        return out\n\n\n# Reference model; the two models share the encoder\nclass RefEncoder(nn.Module):\n    # num_classes is the number of species in the reference set\n    def __init__(self, encoder, num_classes, input_dim=1900):\n        super(RefEncoder, self).__init__()\n        self.encoder = encoder\n        # does the fc layer actually get updated?\n        self.fc = nn.Linear(input_dim, num_classes)\n\n    def forward(self, x):\n        # x has shape (batch_size, seq_len=95, aa_dim=20); the shared encoder\n        # already flattens it to (batch_size, 1900)\n        out = self.encoder(x)\n        # classification head\n        out = self.fc(out)\n        return out\n\n\n# Target model; the two models share the encoder\nclass TarEncoder(nn.Module):\n    def __init__(self, encoder):\n        super(TarEncoder, self).__init__()\n        self.encoder = encoder\n\n    def forward(self, x):\n        # x has shape (batch_size, seq_len=95, aa_dim=20); the shared encoder\n        # returns the flattened (batch_size, 1900) features\n        out = self.encoder(x)\n        return out
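\n\n\n# Editor's sketch (illustrative only): with the shared encoder, TarEncoder\n# yields (B, 1900) features and RefEncoder (B, num_classes) logits for\n# (B, 95, 20) inputs; num_classes=5 is an arbitrary example value\n_enc = TransEncoder()\nassert TarEncoder(_enc)(torch.randn(2, 95, 20)).shape == (2, 1900)\nassert RefEncoder(_enc, num_classes=5)(torch.randn(2, 95, 20)).shape == (2, 5)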
\n\n\n# Cross-entropy loss, used as the descriptiveness loss\nclass CrossEntropyWithLogit(nn.Module):\n    def __init__(self) -> None:\n        super().__init__()\n\n    def forward(self, output: torch.Tensor, labels: torch.Tensor):\n        cri = nn.CrossEntropyLoss()\n        loss = cri(output, labels)\n        return loss\n\n\n# Compactness loss; output has shape (batch_size, classes), where classes is the\n# feature dimension (1900 here) and lamda balances this loss against the\n# descriptiveness loss\ndef compactness_loss(output, batchsize, classes, lamda):\n    # e.g. batchsize=25, classes=1900, output.shape=[25, 1900], m.shape=[1900]\n    m = output.mean(dim=0)\n    lc = 0\n    for i in range(output.shape[0]):\n        # mean of the other batchsize-1 samples\n        m1 = (batchsize * m - output[i]) / (batchsize - 1)\n        lc = lc + torch.matmul(output[i] - m1, output[i] - m1)\n    lc = lamda * lc / (batchsize * classes)\n    return lc
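\n\n\n# Editor's note (sketch): for feature vectors x_i with batch mean m this is\n# lc = lamda/(B*D) * sum_i ||x_i - m_i||^2, where m_i = (B*m - x_i)/(B - 1) is\n# the mean of the remaining B-1 samples; quick numeric check on random data:\nassert compactness_loss(torch.randn(4, 8), 4, 8, 0.1).item() >= 0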
\n\n\n# Training; nothing is frozen here. NB: the two models share parameters\ndef train(model_ref: nn.Module,\n          model_tar: nn.Module,\n          ref_loader: DataLoader,\n          tar_loader: DataLoader,\n          optimizer_ref: torch.optim.Optimizer,\n          optimizer_tar: torch.optim.Optimizer,\n          description_loss: nn.Module, device, lamda, epoch):\n    model_ref.train()\n    model_tar.train()\n    lc = []\n    ld = []\n    # update model_tar first\n    for seq, label in tar_loader:\n        seq = torch.reshape(seq, (-1, 95, 20))\n        seq = seq.to(device)\n        label = label.to(device)\n        optimizer_tar.zero_grad()\n        output = model_tar(seq)\n        loss_lc = compactness_loss(output, output.shape[0], output.shape[1], lamda)\n        lc.append(float(loss_lc))\n        loss_lc.backward()\n        optimizer_tar.step()\n    # because the encoder is shared, the encoder inside model_ref is already updated\n    for seq, label in ref_loader:\n        seq = torch.reshape(seq, (-1, 95, 20))\n        seq = seq.to(device)\n        label = label.to(device)\n        optimizer_ref.zero_grad()\n        output = model_ref(seq)\n        loss_ld = description_loss(output, label)\n        ld.append(float(loss_ld))\n        loss_ld.backward()\n        optimizer_ref.step()\n    '''Disabled experiment: a joint update on both losses every second epoch.\n    if (epoch + 1) % 2 == 0:\n        optimizer_ref.zero_grad()\n        lc1 = torch.tensor(lc[-1], requires_grad=True)\n        ld1 = torch.tensor(ld[-1], requires_grad=True)\n        total_loss = lc1 + ld1\n        total_loss.backward()\n        optimizer_ref.step()'''\n    # likewise, the encoder inside model_tar has been updated as well\n    lc = sum(lc) / len(lc)\n    ld = sum(ld) / len(ld)\n    return lc, ld\n\n\n# Get the feature vector of one class\ndef get_class_embedding(model_tar: nn.Module, tar_samples: DataLoader, device):\n    model_tar.eval()\n    with torch.no_grad():\n        tar_embedding = []\n        for seq, label in tar_samples:\n            seq = torch.reshape(seq, (-1, 95, 20))\n            seq = seq.to(device)\n            label = label.to(device)\n            output = model_tar(seq)\n            tar_embedding.append(output.mean(0).cpu().numpy())\n        tar_embedding = np.array(tar_embedding)\n        # a numpy array is returned here\n        tar_embedding = np.mean(tar_embedding, axis=0)\n    return tar_embedding\n\n\n# Testing: run target_test (plus other data) through the model\n# Several one-class models are needed for testing: model_tar becomes each of the trained models in turn\ndef oneclass_test(tar_test: DataLoader, model_tar: nn.Module, device):\n    model_tar.eval()\n    with torch.no_grad():\n        outputs = []\n        for seq, label in tar_test:\n            seq = torch.reshape(seq, (-1, 95, 20))\n            seq = seq.to(device)\n            label = label.to(device)\n            output = model_tar(seq)\n            outputs.append(output)\n    # similarity against each class feature vector is computed outside this function\n    return outputs\n\n\n# Given a species: split training and reference sets via the helpers above, build and train the two models, then return the target model, the one-class feature vector, and the test data\ndef oneclass(datapath, ref_path, specie):\n    X_ref, y_ref, X_target, y_target = get_data_list(datapath, ref_path, specie)\n    num_class = len(set(y_ref))\n    # re-index the reference-set labels\n    y_ref = label_update(y_ref)\n    ref_loader = FRDataset(X_ref, y_ref)\n    # split train/test directly on the lists\n    X_tar_train, X_tar_test, y_tar_train, y_tar_test = train_test_split(X_target, y_target, test_size=0.3, random_state=0)\n    train_tar = FRDataset(X_tar_train, y_tar_train)\n    train_ref_dataloader = DataLoader(ref_loader, batch_size=128, shuffle=True)\n    train_tar_dataloader = DataLoader(train_tar, batch_size=32, shuffle=True)\n    # build the models; the shared encoder is loaded from a pretrained checkpoint\n    encoder_shared = torch.load('/kaggle/input/new-dataset/save_new.pt')\n    # encoder_shared = TransEncoder()  # untrained alternative\n    model_ref = RefEncoder(encoder_shared, num_classes=num_class, input_dim=1900)\n    model_tar = TarEncoder(encoder_shared)\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    print(device)\n    encoder_shared = encoder_shared.to(device)\n    model_tar = model_tar.to(device)\n    model_ref = model_ref.to(device)\n    description_loss = CrossEntropyWithLogit()\n    # hyperparameters may still need tuning; these follow the GitHub reference, whose model was pretrained\n    optimizer_tar = optim.Adam(model_tar.parameters(), lr=0.5e-3, weight_decay=0.00005)\n    optimizer_ref = optim.Adam(model_ref.parameters(), lr=0.5e-3, weight_decay=0.00005)\n    for epoch in range(10):\n        lc, ld = train(model_ref, model_tar, train_ref_dataloader, train_tar_dataloader, optimizer_ref,\n                       optimizer_tar, description_loss, device, 0.1, epoch)\n        print(f\"epoch:{epoch}, lc:{lc}, ld:{ld}\")\n    # sample part of train_tar to generate this class's feature vector (currently all of it)\n    train_size = int(len(train_tar) * 1)\n    test_size = len(train_tar) - train_size\n    tar_embedding, tar_other = torch.utils.data.random_split(train_tar, [train_size, test_size],\n                                                        generator=torch.Generator().manual_seed(0))\n    tar_emb_dataset = DataLoader(tar_embedding, batch_size=32, shuffle=True)\n    tar_ans = get_class_embedding(model_tar, tar_emb_dataset, device)\n    print(tar_ans.shape)\n    return model_tar, tar_ans, X_tar_test
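\n\n\n# Editor's sketch (illustrative only): the test below scores a sample against\n# each class via the Pearson correlation between its features and the stored\n# class embedding, then predicts the best-scoring class\nassert stats.pearsonr(np.array([1.0, 2.0, 3.0]), np.array([1.1, 1.9, 3.2]))[0] > 0.9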
\n\n\n# model_list stores the target models in the same order as test_data and test_label; tar_emd_list stores the class feature vectors in that order\n# Keep the test order identical for every model; for now the whole test set is fed through each model in turn\ndef all_species_test(model_list, tar_emd_list, test_data, test_label, device):\n    y_pred = []\n    ans = []\n    for i in range(len(model_list)):\n        model_tar = model_list[i]\n        model_tar.eval()\n        with torch.no_grad():\n            outputs = []\n            for seq in test_data:\n                seq = torch.reshape(seq, (-1, 95, 20))\n                seq = seq.to(device)\n                output = model_tar(seq)\n                # Pearson correlation as the similarity score;\n                # tar_emd_list[i] has shape (1900,), output has shape (1, 1900)\n                cor = stats.pearsonr(tar_emd_list[i], output[0].cpu().numpy())[0]\n                outputs.append(cor)\n        ans.append(outputs)\n    '''Disabled alternative: keep the true label whenever its own model scores highest.\n    for i, label in enumerate(test_label):\n        if ans[label][i] == max(ans[label]):\n            y_pred.append(label)\n        else:\n            y = [score[i] for score in ans]\n            y_pred.append(y.index(max(y)))'''\n    # predict the class whose model gives the highest score\n    for i in range(len(test_label)):\n        y = [score[i] for score in ans]\n        y_pred.append(y.index(max(y)))\n    print('accuracy:', metrics.accuracy_score(test_label, y_pred))\n    C = metrics.confusion_matrix(test_label, y_pred)\n    print('Confusion matrix:')\n    print(C)\n    # row-normalized confusion matrix\n    matrix2 = np.array(C)\n    matrix2 = matrix2 / matrix2.sum(axis=1)[:, np.newaxis]\n    print(matrix2)\n    fig = plt.figure()\n    sns_plot = sns.heatmap(matrix2, annot=False, cmap=\"YlGnBu\")\n    plt.show()\n\n\nif __name__ == '__main__':\n    model_list = []\n    tar_emd_list = []\n    test_data = []\n    test_label = []\n    data_path = '/kaggle/input/new-dataset/821.csv'\n    ref_path = '/kaggle/input/new-dataset/821.csv'\n    # load all the data and train a one-class model for every species\n    df = pd.read_csv(data_path, low_memory=False)\n    # collect the distinct species names\n    specie = df['label'].tolist()\n    specie = list(set(specie))\n    specie.sort()\n    for i in range(len(specie)):\n        print(specie[i])\n        model_tar, tar_ans, X_tar_test = oneclass(data_path, ref_path, specie[i])\n        # '/' cannot appear in file names, so replace it before saving\n        fname = specie[i].replace('/', 'or')\n        torch.save(model_tar, fname + '.model.pt')\n        np.save(fname + '.features.npy', tar_ans)\n        model_list.append(model_tar)\n        tar_emd_list.append(tar_ans)\n        test_data = test_data + X_tar_test\n        test_label = test_label + [i] * len(X_tar_test)\n    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    all_species_test(model_list, tar_emd_list, test_data, test_label, device)","metadata":{"execution":{"iopub.status.busy":"2023-08-21T09:50:40.101642Z","iopub.execute_input":"2023-08-21T09:50:40.101996Z","iopub.status.idle":"2023-08-21T09:50:49.708066Z","shell.execute_reply.started":"2023-08-21T09:50:40.101964Z","shell.execute_reply":"2023-08-21T09:50:49.706417Z"},"trusted":true},"execution_count":null,"outputs":[]}]}
\ No newline at end of file
-- 
GitLab