encoder

38e902e4 · Ziqian · 7a7d5a9d · 38e902e4
Commit 38e902e4 authored 1 year ago by Ziqian
--- a/encoder.py
+++ b/encoder.py
+import numpy as np
+import torch
+import pandas as pd
+from itertools import product
+from scipy import stats
+from torch import nn
+
# Canonical BLOSUM62 residue order: position i in this string is the row (and
# column) index of that residue in the substitution matrix.
_BLOSUM62_ALPHABET = "ARNDCQEGHILKMFPSTWYV"

# Residue -> index map, built once at import time so a lookup is a single dict
# access instead of rebuilding a table and scanning a list on every call.
_BLOSUM62_INDEX = {
    residue: position for position, residue in enumerate(_BLOSUM62_ALPHABET)
}


def get_blosum62_index(char):
    """Return the BLOSUM62 row index of a one-letter amino-acid code.

    The index follows the residue order ``A R N D C Q E G H I L K M F P S T W
    Y V`` (``'A'`` -> 0, ..., ``'V'`` -> 19). Matching is exact: lower-case
    letters, empty strings and multi-character strings are rejected.

    Args:
        char: one upper-case, one-letter amino-acid code.

    Returns:
        int: position of ``char`` in the residue order, in ``range(20)``.

    Raises:
        ValueError: if ``char`` is not one of the 20 supported codes.
    """
    try:
        return _BLOSUM62_INDEX[char]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments (e.g. a list). Both cases are
        # reported as ValueError, the exception type callers already got from
        # the previous list.index()-based lookup.
        raise ValueError(
            "{!r} is not a supported amino-acid code (expected one of {})".format(
                char, _BLOSUM62_ALPHABET
            )
        ) from None
+
+
# BLOSUM62 substitution matrix used to encode amino-acid sequences.
# Rows and columns follow the residue order A R N D C Q E G H I L K M F P S T
# W Y V, i.e. the index returned by get_blosum62_index(). It is built once at
# import time rather than on every call to encoding().
_BLOSUM62_MATRIX = np.array([
    [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
    [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
    [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
    [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
    [0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
    [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
    [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
    [0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
    [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
    [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
    [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
    [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2],
    [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1],
    [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1],
    [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2],
    [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
    [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0],
    [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3],
    [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1],
    [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],
])


def encoding(aa):
    """Encode an amino-acid sequence as a flat list of BLOSUM62 scores.

    Each residue is replaced by its 20-value BLOSUM62 row (the row is selected
    with ``get_blosum62_index``) and the rows are concatenated in sequence
    order, so a sequence of ``n`` residues yields ``20 * n`` values.

    Args:
        aa: iterable of one-letter amino-acid codes, e.g. a ``str``.

    Returns:
        list: the concatenated scores as NumPy integer scalars; an empty list
        for an empty sequence.

    Raises:
        Whatever ``get_blosum62_index`` raises for a residue it does not
        recognise.
    """
    # Iterating a matrix row yields its 20 scalar scores, so the nested
    # comprehension flattens row by row without an intermediate list of rows.
    return [
        score
        for residue in aa
        for score in _BLOSUM62_MATRIX[get_blosum62_index(residue)]
    ]
+
+
# Notes carried over from the original author (translated):
# - Build the model and loss function for training and return the trained
#   model, or sample some target/train data, extract features and obtain the
#   single-class feature vectors.
# - Model choice: a Transformer encoder is used as a first attempt.
# - TODO: pre-train a Transformer and initialise from its pre-trained weights.
class TransEncoder(nn.Module):
    """Transformer encoder that returns one flattened feature vector per sample.

    The input is expected to be already embedded by the caller, shaped
    ``(batch, seq_len, d_model)``. With the defaults and the 95-residue
    sequences mentioned in the original notes this is ``(batch, 95, 20)`` and
    the output is ``(batch, 1900)``.

    Args:
        d_model: size of each per-position feature vector. Must be divisible
            by ``nhead``.
        nhead: number of attention heads.
        num_layers: number of stacked encoder layers.
    """

    def __init__(self, d_model=20, nhead=5, num_layers=6):
        super().__init__()
        # batch_first=True makes dimension 0 the batch axis. The PyTorch
        # default (batch_first=False) would read a (batch, seq, dim) tensor as
        # (seq, batch, dim) and run attention across samples instead of across
        # sequence positions. NOTE: weights trained with the old layout will
        # produce different outputs under this corrected layout.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, batch_first=True
        )
        # Attribute name kept as-is so existing state_dict keys still match.
        self.TransformerEncoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )

    def forward(self, x):
        """Encode ``x`` and flatten each sample's sequence into one vector.

        Args:
            x: tensor of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch, seq_len * d_model)``.
        """
        out = self.TransformerEncoder(x)
        # reshape (rather than view) also works if the encoder output is not
        # contiguous in memory.
        return out.reshape(out.size(0), -1)
+
+
# Reference model. The reference and target models share one encoder.
class RefEncoder(nn.Module):
    """Classifier built from a shared encoder followed by a linear head.

    Args:
        encoder: module producing the features fed to the linear head. Its
            output's last dimension must equal ``input_dim`` (presumably the
            flattened ``seq_len * aa_dim`` features, 95 * 20 = 1900 by
            default -- confirm against the encoder in use).
        num_classes: number of output classes; per the original note this
            should be read from the reference data.
        input_dim: width of the encoder output.
    """

    def __init__(self, encoder, num_classes, input_dim=1900):
        super().__init__()
        self.encoder = encoder
        # Open question left by the original author: is this fc layer
        # actually updated during training?
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Return class scores for ``x``.

        ``x`` is passed to the encoder unchanged; the encoder is expected to
        return already-flattened features, so no reshaping is done here.
        """
        features = self.encoder(x)
        # Classification head.
        return self.fc(features)
+
+
# Target model. The reference and target models share one encoder.
class TarEncoder(nn.Module):
    """Thin wrapper that exposes the shared encoder's features directly.

    Args:
        encoder: module applied to the input; its output is returned as-is.
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, x):
        """Return ``self.encoder(x)`` unchanged.

        No reshaping is done here; any flattening is expected to happen inside
        the encoder.
        """
        return self.encoder(x)