Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
Tsinghua-A
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Model registry
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
2023 Competition
Software Tools
Tsinghua-A
Commits
38e902e4
Commit
38e902e4
authored
1 year ago
by
Ziqian
Browse files
Options
Downloads
Patches
Plain Diff
encoder
parent
7a7d5a9d
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
encoder.py
+116
-0
116 additions, 0 deletions
encoder.py
with
116 additions
and
0 deletions
encoder.py
0 → 100644
+
116
−
0
View file @
38e902e4
import
numpy
as
np
import
torch
import
pandas
as
pd
from
itertools
import
product
from
scipy
import
stats
from
torch
import
nn
# Amino-acid order of the BLOSUM62 table: the residue at position i owns
# row i (and column i) of the score matrix indexed by `encoding`.
_BLOSUM62_KEY_ORDER = (
    'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
    'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V',
)


def get_blosum62_index(char):
    """Return the BLOSUM62 row index of a one-letter amino-acid code.

    Args:
        char: Upper-case one-letter amino-acid code, one of
            ``ARNDCQEGHILKMFPSTWYV``.

    Returns:
        The zero-based position of ``char`` in the BLOSUM62 residue order
        (``'A'`` -> 0, ..., ``'V'`` -> 19).

    Raises:
        ValueError: If ``char`` is not one of the 20 supported codes
            (lower-case letters and ambiguity codes are not accepted).
    """
    # Only the residue order matters for the lookup, so no score table is
    # built here; a tuple (rather than a str) keeps exact-match semantics,
    # i.e. multi-character inputs such as 'AR' are rejected.
    return _BLOSUM62_KEY_ORDER.index(char)
# BLOSUM62 encoding of amino-acid sequences.
# Residue order shared by the rows and the columns of _BLOSUM62_SCORES.
_BLOSUM62_ALPHABET = "ARNDCQEGHILKMFPSTWYV"
# One-letter code -> row number, so each residue is resolved in O(1).
_BLOSUM62_ROW_OF = {
    residue: row for row, residue in enumerate(_BLOSUM62_ALPHABET)
}
# Built once at import time instead of on every call.
_BLOSUM62_SCORES = np.array([
    [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
    [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
    [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
    [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
    [0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
    [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
    [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
    [0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
    [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
    [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
    [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
    [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2],
    [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1],
    [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1],
    [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2],
    [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
    [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0],
    [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3],
    [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1],
    [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],
])


def encoding(aa):
    """Encode an amino-acid sequence as concatenated BLOSUM62 rows.

    Args:
        aa: Iterable of upper-case one-letter amino-acid codes (typically a
            ``str``), each one of ``ARNDCQEGHILKMFPSTWYV``.

    Returns:
        A flat ``list`` of ``20 * len(aa)`` NumPy integer scalars: the
        20 BLOSUM62 scores of the first residue, followed by those of the
        second residue, and so on. An empty input yields ``[]``.

    Raises:
        ValueError: If ``aa`` contains anything other than the 20 supported
            one-letter codes.
    """
    rows = []
    for residue in aa:
        try:
            rows.append(_BLOSUM62_SCORES[_BLOSUM62_ROW_OF[residue]])
        except (KeyError, TypeError):
            # Unknown (or unhashable) residues are reported as ValueError,
            # the exception type callers of this function already get.
            raise ValueError(
                f"{residue!r} is not a BLOSUM62 amino-acid code"
            ) from None
    # Flatten row by row; iterating the NumPy rows keeps the elements as
    # NumPy scalars rather than converting them to Python ints.
    return [score for row in rows for score in row]
# Build the model and loss function for training and return the trained
# model; alternatively sample some targets for training, extract their
# features and obtain a single-class feature vector.
# Model choice: a transformer encoder is used as a first attempt.
# TODO: pre-train a transformer later and initialise from its parameters.
class TransEncoder(nn.Module):
    """Transformer encoder that flattens each encoded sequence to a vector.

    The amino-acid embedding (BLOSUM62 encoding) is computed outside this
    module; the module only runs the encoder stack and flattens its output.

    Args:
        d_model: Feature size of every sequence position (20 BLOSUM62
            scores per residue by default). Must be divisible by ``nhead``.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, d_model=20, nhead=5, num_layers=6):
        super().__init__()
        # The embedding is already done outside the model.
        # d_model must be divisible by nhead.
        # batch_first=True: inputs are (batch, seq, feature). With the
        # PyTorch default (seq, batch, feature) attention would run across
        # the batch axis and mix unrelated samples.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, batch_first=True
        )
        # Attribute name kept as-is so existing state_dict keys still load.
        self.TransformerEncoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )

    def forward(self, x):
        """Encode ``x`` and flatten every sample.

        Args:
            x: Tensor of shape (batch, seq_len, d_model); the original
                design uses seq_len=95 and d_model=20.

        Returns:
            Tensor of shape (batch, seq_len * d_model), i.e. (batch, 1900)
            for the original design.
        """
        out = self.TransformerEncoder(x)
        # Collapse (batch, seq_len, d_model) to (batch, seq_len * d_model);
        # reshape also copes with a non-contiguous encoder output.
        return out.reshape(out.size(0), -1)
# Reference model; the reference and target models share one encoder.
class RefEncoder(nn.Module):
    """Shared encoder followed by a linear classification head.

    Args:
        encoder: Module mapping the input to features whose last dimension
            is ``input_dim``; it is stored as-is, not copied.
        num_classes: Output size of the head; it has to be read from the
            number of reference classes.
        input_dim: Size of the encoder's output features fed to the head.
    """

    def __init__(self, encoder, num_classes, input_dim=1900):
        super().__init__()
        self.encoder = encoder
        # Open question from the author: is the fc layer actually updated?
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Return the classification-head output for ``x``.

        ``x`` is documented by the author as (batch, seq_len 95, aa_dim 20).
        No flattening happens here: the encoder output is passed straight
        to the head.
        """
        features = self.encoder(x)
        # Apply the classification head.
        logits = self.fc(features)
        return logits
# Target model; the reference and target models share one encoder.
class TarEncoder(nn.Module):
    """Thin wrapper that exposes the shared encoder's features directly.

    Args:
        encoder: Module applied to the input; it is stored as-is, not
            copied.
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, x):
        """Return the encoder output for ``x`` unchanged.

        ``x`` is documented by the author as (batch, seq_len 95, aa_dim 20).
        No flattening or head is applied here.
        """
        return self.encoder(x)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment