This post is simply a backup of the code I used while learning, with deeper notes added along the way, as a way into machine translation.

Version 1.0 (Preliminary Round)

Corpus Preprocessing

import sys
import jieba

# name is the "xxx" in `python PreProcess.py xxx`, e.g. "zh_fr"
name = sys.argv[1]
# Split on '_' to get language one (L1) and language two (L2)
L1, L2 = name.split('_')

# 1. Extract the L1 and L2 sides from name.train
def extract_L1_from_train(text_dir='datasets/'+name+'.train', save_dir='datasets/'+L1+'.txt'):
    # Open the new file L1.txt with write access, call it fw
    with open(save_dir, 'w', encoding='utf-8') as fw:
        # Open xxx.train read-only, call it f
        with open(text_dir, 'r', encoding='utf-8') as f:
            # Read f line by line; each line (item) holds one parallel pair
            for item in f.readlines():
                # Split on '\t'; the first field is the L1 sentence
                sentence = item.split('\t')[0]
                '''
                if 'zh' in save_dir:
                    sentence = ' '.join(jieba.cut(sentence))
                '''
                # Write the L1 sentence followed by a newline
                fw.write(sentence+'\n')

# Run the extraction
extract_L1_from_train()

def extract_L2_from_train(text_dir='datasets/'+name+'.train', save_dir='datasets/'+L2+'.txt'):
    with open(save_dir, 'w', encoding='utf-8') as fw:
        with open(text_dir, 'r', encoding='utf-8') as f:
            for item in f.readlines():
                # split('\t')[1] keeps the trailing newline, so write() does not add '\n'
                sentence = item.split('\t')[1]
                '''
                if 'zh' in save_dir:
                    sentence = ' '.join(jieba.cut(sentence))
                '''
                fw.write(sentence)

extract_L2_from_train()

# 2. Build a vocabulary with subword-nmt and segment the text according to it
# os.system('command') executes a shell command line
import os
'''
Blog posts explaining BPE (Byte-Pair Encoding):
https://blog.csdn.net/qq_27590277/article/details/88343988?spm=1001.2101.3001.6661.1&depth_1-utm_source=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-1-88343988-blog-103811328.t5_layer_eslanding_S_4
https://blog.csdn.net/weixin_38877987/article/details/118217314?spm=1001.2101.3001.6650.2&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-2-118217314-blog-88343988.pc_relevant_multi_platform_whitelistv6
subword-nmt segmentation: parameters and command-line usage
https://github.com/rsennrich/subword-nmt
'''

# Learn the BPE codes and the vocabulary
os.system('subword-nmt learn-joint-bpe-and-vocab \
--input datasets/'+L1+'.txt \
-s 2048 \
-o datasets/'+L1+'.bpe \
--write-vocabulary datasets/'+L1+'.vocab')

os.system('subword-nmt learn-joint-bpe-and-vocab \
--input datasets/'+L2+'.txt \
-s 2048 \
-o datasets/'+L2+'.bpe \
--write-vocabulary datasets/'+L2+'.vocab')

# Apply the learned BPE codes to segment the raw text
os.system('subword-nmt apply-bpe -c datasets/'+L1+'.bpe < datasets/'+L1+'.txt > datasets/bpe_'+L1+'.txt')
os.system('subword-nmt apply-bpe -c datasets/'+L2+'.bpe < datasets/'+L2+'.txt > datasets/bpe_'+L2+'.txt')

# 3. Convert every BPE-segmented sentence into a sequence of token ids
class IDmanager:
    def __init__(self, vocab_dir='datasets/zh.vocab', thre=5):
        # Four hand-defined special tokens:
        # '<unk>': 0, '<s>': 1, '<e>': 2, '<pad>': 3
        self.vocab = ['<unk>', '<s>', '<e>', '<pad>']
        with open(vocab_dir, 'r', encoding='utf-8') as f:
            for item in f.readlines():
                # Each line holds a word plus its count
                word, num = item.split(' ')
                # split() returns strings, so cast the count to int;
                # num[:-1] just strips the trailing newline.
                # The first four ids are reserved for the special tokens above;
                # a possible trick would be to append those four after the real vocabulary instead
                if int(num[:-1]) >= thre:
                    # Append the word; its index in self.vocab is its final id
                    self.vocab.append(word)

    def word2id(self, sentence):
        s = sentence.split(' ')
        r = []
        for item in s:
            try:
                # If the word exists in the vocab, use its index
                i = self.vocab.index(item)
            except:
                # Otherwise fall back to 0 ('<unk>')
                i = 0
            # Translate the sentence from words into an id sequence
            r.append(str(i))
        # (Building the output string with `if item != s[-1]` instead does not work:
        #  a token in the middle can be identical to the last token s[-1])
        return r

manager = IDmanager(vocab_dir='datasets/'+L1+'.vocab')

# Turn the BPE-segmented text into id sequences
with open('datasets/'+L1+'.bos_id', 'w', encoding='utf-8') as fw:
    with open('datasets/bpe_'+L1+'.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            # readlines() gives a list with one string per line;
            # line[:-1] drops the trailing newline, and the ids are joined with spaces
            fw.write(' '.join(manager.word2id(line[:-1]))+'\n')

manager = IDmanager(vocab_dir='datasets/'+L2+'.vocab')

with open('datasets/'+L2+'.bos_id', 'w', encoding='utf-8') as fw:
    with open('datasets/bpe_'+L2+'.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            fw.write(' '.join(manager.word2id(line[:-1]))+'\n')
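
The comment block above only links to explanations of BPE; as a quick reminder of what learn-joint-bpe-and-vocab is doing conceptually, here is a minimal, self-contained sketch of the merge loop (the toy corpus and helper names are mine, purely for illustration; the real work here is done by subword-nmt):

import collections
import re

def get_pair_stats(vocab):
    # Count how often each adjacent symbol pair occurs, weighted by word frequency
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge_pair(pair, vocab):
    # Rewrite every word, gluing the chosen pair "a b" into a single symbol "ab"
    bigram = re.escape(' '.join(pair))
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    return {pattern.sub(''.join(pair), word): freq for word, freq in vocab.items()}

# Toy corpus: each word is a space-separated character sequence with an end-of-word marker
vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
for step in range(5):
    stats = get_pair_stats(vocab)
    best = max(stats, key=stats.get)   # the most frequent adjacent pair
    vocab = merge_pair(best, vocab)
    print(step, best)                  # each printed pair is one learned merge rule

With -s 2048 the tool repeats roughly this merge step 2048 times, writing the learned merge operations to the .bpe file and the token frequencies to the .vocab file, which is exactly what apply-bpe and IDmanager consume above.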


Building the Model

import paddle
from paddle.nn import Transformer
# Transformer explained:
# https://blog.csdn.net/Tink1995/article/details/105012972
# https://blog.csdn.net/Tink1995/article/details/105080033
class MyNet(paddle.nn.Layer):
    def __init__(self, emb_dim=512, n_head=8, num_encoder_layers=6, num_decoder_layers=6,
                 dim_feedforward=1024, max_len=52, L1_vocab_size=7602, L2_vocab_size=4134):
        super(MyNet, self).__init__()
        # 7602 and 4134 are the vocabulary sizes: simply the line counts of datasets/zh.vocab etc.
        # +4 makes room for the special tokens <unk>, <s>, <e>, <pad>
        self.emb_src = paddle.nn.Embedding(L1_vocab_size+4, emb_dim, sparse=True)
        self.emb_tgt = paddle.nn.Embedding(L2_vocab_size+4, emb_dim, sparse=True)
        self.transformer = paddle.nn.Transformer(d_model=emb_dim,
                                                 nhead=n_head,
                                                 num_encoder_layers=num_encoder_layers,
                                                 num_decoder_layers=num_decoder_layers,
                                                 dim_feedforward=dim_feedforward,
                                                 dropout=0.1,
                                                 activation='relu',
                                                 attn_dropout=None,
                                                 act_dropout=None,
                                                 normalize_before=True,
                                                 weight_attr=None,
                                                 bias_attr=None,
                                                 custom_encoder=None,
                                                 custom_decoder=None)
        # Causal mask: position k may only attend to positions 0..k
        self.mask = self.transformer.generate_square_subsequent_mask(max_len)
        self.fc = paddle.nn.Linear(emb_dim, L2_vocab_size+4)
        # self.softmax = paddle.nn.Softmax()

    def forward(self, src, tgt):
        src = self.emb_src(src)
        tgt = self.emb_tgt(tgt)

        x = self.transformer(src, tgt, self.mask, self.mask, self.mask)
        out = self.fc(x)
        # out = self.softmax(x)
        return out

paddle.summary(MyNet(), ((1, 52), (1, 52)), dtypes='int64')
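
The mask built by generate_square_subsequent_mask is what lets the decoder train on whole sentences while still making position k depend only on the positions before it (the prediction section relies on this for step-by-step decoding). A small hand-built sketch of such a causal mask, assuming the usual 0 / -inf convention that the Paddle helper also uses:

import paddle

length = 5
# 0 where attention is allowed (column <= row), -inf where it is blocked (column > row)
mask = paddle.triu(paddle.full([length, length], float('-inf')), diagonal=1)
print(mask)
# Row k has -inf in columns k+1..length-1, so token k cannot see future tokens;
# once added to the attention scores, -inf turns into 0 after the softmax.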

Data Reader

import paddle

class MyDateset(paddle.io.Dataset):
    def __init__(self, mode='train', src_dir='datasets/zh.bos_id', tgt_dir='datasets/th.bos_id', max_len=52):
        super(MyDateset, self).__init__()

        self.mode = mode
        self.src = []
        with open(src_dir, 'r') as f:
            for item in f.readlines():
                # One sentence per line, ids separated by spaces; each sentence is one list entry
                self.src.append([int(x) for x in item.split(' ')])
        self.tgt = []
        with open(tgt_dir, 'r') as f:
            for item in f.readlines():
                self.tgt.append([int(x) for x in item.split(' ')])
        # Maximum sentence length
        self.max_len = max_len

    def __getitem__(self, index):
        # Fetch the src and tgt id sequences for this index
        src = self.src[index]
        tgt = self.tgt[index]

        if len(src)+1 < self.max_len:
            # Padding: append the end token 2 ('<e>'), then pad with 3 ('<pad>') up to max_len
            # (list + list concatenates, e.g. [3] * 2 == [3, 3])
            src = src + [2] + (self.max_len - len(src) - 1) * [3]
        else:
            # Longer sentences keep the first max_len-1 ids and end with 2 ('<e>')
            # (slicing [:n] keeps the first n items)
            src = src[:self.max_len-1]
            src = src + [2]
        if len(tgt)+1 < self.max_len:
            lbl = tgt + [2] + (self.max_len - len(tgt) - 1) * [3]
        else:
            lbl = tgt[:self.max_len-1]
            lbl = lbl + [2]

        # 1 is '<s>' (start): prepend it to the decoder input
        # lbl is the target without the start token; tgt (the decoder input) has it
        tgt = [1] + tgt
        if len(tgt)+1 < self.max_len:
            tgt = tgt + [2] + (self.max_len - len(tgt) - 1) * [3]
            # alternative: left-pad instead
            # tgt = (self.max_len - len(tgt) - 1) * [3] + tgt + [2]
        else:
            tgt = tgt[:self.max_len-1]
            tgt = tgt + [2]

        src = paddle.to_tensor(src).astype('int64')
        tgt = paddle.to_tensor(tgt).astype('int64')
        lbl = paddle.to_tensor(lbl).astype('int64')

        return src, tgt, lbl

    def __len__(self):
        return len(self.src)

# Quick test of the dataloader
if 1:
    train_dataset = MyDateset()

    # batch_size=16: each batch calls __getitem__ 16 times, i.e. 16 samples per batch
    # drop_last: whether to drop the final incomplete batch
    train_dataloader = paddle.io.DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True,
        drop_last=False)

    # Each iteration of enumerate pulls one batch from train_dataloader
    for step, data in enumerate(train_dataloader):
        src, tgt, lbl = data
        print(step, src.shape, tgt.shape, lbl.shape)
        print('src:', src[0])
        print('tgt:', tgt[0])
        print('lbl:', lbl[0])
        break
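
To make the padding and shifting in __getitem__ concrete, here is a standalone toy walk-through of the same logic (made-up ids, max_len=8, no data files needed):

max_len = 8
src_ids = [5, 6, 7]        # toy source-language BPE ids
tgt_ids = [9, 10]          # toy target-language BPE ids

# Encoder input: source + <e>(2), padded with <pad>(3) to max_len
src = src_ids + [2] + (max_len - len(src_ids) - 1) * [3]
# Label: target + <e>, padded (no <s> at the front)
lbl = tgt_ids + [2] + (max_len - len(tgt_ids) - 1) * [3]
# Decoder input: <s>(1) + target, then <e> and padding
tgt = [1] + tgt_ids
tgt = tgt + [2] + (max_len - len(tgt) - 1) * [3]

print('src:', src)   # [5, 6, 7, 2, 3, 3, 3, 3]
print('tgt:', tgt)   # [1, 9, 10, 2, 3, 3, 3, 3]
print('lbl:', lbl)   # [9, 10, 2, 3, 3, 3, 3, 3]
# At position k the decoder sees tgt[0..k] and is trained to predict lbl[k],
# i.e. the next target token (teacher forcing).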

Training

def train(direction='zh_fr', learning_rate=0.001, n_head=4,
          num_encoder_layers=1, num_decoder_layers=1, max_epoch=5,
          emb_dim=512, dim_feedforward=256):

    L1, L2 = direction.split('_')
    with open('datasets/'+L1+'.vocab') as f:
        L1_vocab_size = len(f.readlines())
    with open('datasets/'+L2+'.vocab') as f:
        L2_vocab_size = len(f.readlines())

    model = MyNet(emb_dim=emb_dim, n_head=n_head, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, L1_vocab_size=L1_vocab_size, L2_vocab_size=L2_vocab_size)
    # .train() puts the model in training mode, so the weights get updated
    model.train()

    # Keep this as `if 1` to resume from a previously saved checkpoint
    if 1:
        try:
            param_dict = paddle.load(direction+'.pdparams')
            model.load_dict(param_dict)
        except:
            print('no such pdparams')

    train_dataset = MyDateset(mode='train', src_dir='datasets/'+L1+'.bos_id', tgt_dir='datasets/'+L2+'.bos_id')
    train_dataloader = paddle.io.DataLoader(
        train_dataset,
        batch_size=256,
        shuffle=True,
        drop_last=False)

    # Optimizer: cosine annealing schedule for the learning rate + Adam
    scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=learning_rate, T_max=max_epoch)
    opt = paddle.optimizer.Adam(learning_rate=scheduler, parameters=model.parameters())

    now_step = 0
    min_loss = 999
    for epoch in range(max_epoch):
        for step, data in enumerate(train_dataloader):
            now_step += 1

            src, tgt, lbl = data
            pre = model(src, tgt)
            loss = paddle.nn.functional.cross_entropy(pre, lbl).mean()
            loss.backward()
            opt.step()
            opt.clear_gradients()
            if now_step % 10 == 0:
                print("epoch: {}, batch: {}, loss is: {}".format(epoch, step, loss.mean().numpy()))
            # Keep the checkpoint with the lowest loss seen so far
            if loss < min_loss:
                min_loss = loss
                paddle.save(model.state_dict(), direction+'.pdparams')
        # Advance the cosine annealing schedule once per epoch
        scheduler.step()
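
For reference, CosineAnnealingDecay follows the usual SGDR-style cosine curve, decaying from the base learning rate towards eta_min (0 by default) over T_max epochs. A standalone sketch of the curve the scheduler above is meant to trace (plain Python, values only illustrative):

import math

base_lr, eta_min, T_max = 0.001, 0.0, 5
for epoch in range(T_max + 1):
    # lr(t) = eta_min + (base_lr - eta_min) * (1 + cos(pi * t / T_max)) / 2
    lr = eta_min + (base_lr - eta_min) * (1 + math.cos(math.pi * epoch / T_max)) / 2
    print(epoch, round(lr, 6))
# epoch 0 -> 0.001, epoch T_max -> 0.0, with the steepest drop in the middle.

The schedule only advances when scheduler.step() is called, which is why the training loop above calls it once per epoch.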

Prediction

import os
import numpy as np
import paddle
# import jieba

def predict(direction='zh_fr', n_head=4,
            num_encoder_layers=1, num_decoder_layers=1,
            emb_dim=512, dim_feedforward=256):
    L1, L2 = direction.split('_')
    # len(f.readlines()) is the number of lines in the vocabulary file
    with open('datasets/'+L1+'.vocab') as f:
        L1_vocab_size = len(f.readlines())
    with open('datasets/'+L2+'.vocab') as f:
        L2_vocab_size = len(f.readlines())

    # Load the model
    model = MyNet(emb_dim=emb_dim, n_head=n_head, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, L1_vocab_size=L1_vocab_size, L2_vocab_size=L2_vocab_size)
    if 1:
        param_dict = paddle.load(direction+'.pdparams')
        model.load_dict(param_dict)
    # .eval() switches the model to inference mode; the weights no longer change
    model.eval()

    # Open the output file that will hold the translations
    with open(direction+'.rst', 'w') as output_file:
        # Open the test input file
        with open('datasets/'+direction+'.test') as input_file:
            # Translate sentence by sentence
            for sentence in input_file.readlines():
                '''
                if L1 == 'zh':
                    sentence = ' '.join(jieba.cut(sentence))
                '''
                # subword-nmt cannot be run from the notebook cell directly, so write the
                # test sentence to tmp.txt and segment it on the command line instead.
                # The segmented result looks like: 4星@@ 暴@@ 怒@@ 耳@@ 环
                with open('tmp.txt', 'w') as f:
                    f.write(sentence)
                os.system('subword-nmt apply-bpe -c datasets/'+L1+'.bpe < tmp.txt > tmp2.txt')

                # This IDmanager does both word2id and id2word
                class IDmanager:
                    def __init__(self, vocab_dir='datasets/zh.vocab', thre=1):
                        self.vocab = ['<unk>', '<s>', '<e>', '<pad>']
                        with open(vocab_dir, 'r') as f:
                            for item in f.readlines():
                                word, num = item.split(' ')
                                # thre must match the value used during preprocessing,
                                # otherwise the ids here will not line up with training
                                if int(num[:-1]) >= thre:
                                    self.vocab.append(word)

                    def word2id(self, sentence):
                        s = sentence.split(' ')
                        r = []
                        for item in s:
                            try:
                                i = self.vocab.index(item)
                            except:
                                i = 0
                            r.append(str(i))
                        return r

                    def id2word(self, bos_id_list):
                        r = []
                        for item in bos_id_list:
                            r.append(self.vocab[item])
                        return r

                # Convert the segmented sentence into ids
                manager = IDmanager(vocab_dir='datasets/'+L1+'.vocab')
                # We are still inside `for sentence in input_file.readlines():`,
                # so tmp.txt / tmp2.txt are recreated for every sentence
                with open('tmp2.txt', 'r') as f:
                    line = f.readlines()[0]
                    # print(line)
                    # Build the id sequence for this sentence
                    bos_id_list = manager.word2id(line)

                # The temporary files have served their purpose
                os.system('rm -rf tmp.txt')
                os.system('rm -rf tmp2.txt')

                # print(bos_id_list)

                # Pad every input to length 52. As noted when building the model, the output
                # at position k is not affected by decoder inputs after it, so even without
                # knowing tgt we can decode step by step, starting from '<s>'
                def PADtoMax(bos_id, max_len=52, with_s=0):
                    if with_s:
                        bos_id = [1] + bos_id
                    if len(bos_id)+1 < max_len:
                        bos_id = bos_id + [2] + (max_len - len(bos_id) - 1) * [3]
                    else:
                        bos_id = bos_id + [2]
                    bos_id = bos_id[:max_len]
                    return bos_id

                max_len = 52
                # Pad the source ids
                src_id = paddle.to_tensor(PADtoMax(bos_id_list), dtype='int64')
                src_id = src_id.reshape([1]+src_id.shape)

                # Greedy decoding loop
                raw_tgt_id = []
                for i in range(max_len):
                    tgt_id = PADtoMax(raw_tgt_id, max_len=52, with_s=1)
                    # print(tgt_id)
                    tgt_id = paddle.to_tensor(tgt_id, dtype='int64')
                    tgt_id = tgt_id.reshape([1]+tgt_id.shape)
                    pre = model(src_id, tgt_id)
                    new_id = np.argmax(pre[0][i])
                    # Feed the decoded token back in as input for the next step
                    raw_tgt_id.append(new_id)
                # print(raw_tgt_id)

                manager = IDmanager(vocab_dir='datasets/'+L2+'.vocab')

                s1 = ' '.join(manager.id2word(raw_tgt_id))
                # Strip the special tokens and the '@@' BPE markers
                s2 = s1.replace('<s> ','').replace('<e> ','').replace('<pad> ','').replace('<unk> ','').replace('@@ ','').replace('<s>','').replace('<e>','').replace('<pad>','').replace('<unk>','').replace('@@','')
                # print('raw result:', s1)
                # print('result:', s2)
                output_file.write(s2+'\n')

Submission

import os
# directions = ['zh_fr','zh_th','zh_ru','fr_zh','th_zh','ru_zh']
directions = ['th_fr']
for direction in directions:
    print(direction)
    os.system('python PreProcess.py '+direction)
    # train(direction=direction)
    train(direction=direction, learning_rate=0.001, n_head=1, num_encoder_layers=1, num_decoder_layers=1, max_epoch=20, emb_dim=1024, dim_feedforward=1024)
    train(direction=direction, learning_rate=0.00001, n_head=1, num_encoder_layers=1, num_decoder_layers=1, max_epoch=20, emb_dim=1024, dim_feedforward=1024)
    # predict(direction=direction)
    predict(direction=direction, n_head=1, num_encoder_layers=1, num_decoder_layers=1, emb_dim=1024, dim_feedforward=1024)
# Zip up the results for submission
! zip trans_result.zip *.rst

Top-3 Solution Sharing from the Final Round

First Place

(slide images)

Data Collection

(slide images)

Iterative Retrieval

(slide images)

Parallel Corpus Construction

(slide images)

Training Method

(slide images)

Domain knowledge is accumulated gradually.

(slide images)

Model weights are inherited to speed up training, with constraints placed on the parameters.

(slide images)

Ensembling

Summary-like: summary-based ensembling. The same data run through different models produces quite different outputs; filtering the data with summaries lets the models compensate for each other's weaknesses.

Majority-like: proposed in 2017, it scores candidates by similarity and suits models whose performance is close. An enhanced variant was therefore proposed: once a model's score passes a threshold, that model is considered strong enough and is not replaced, and the models are then iterated repeatedly. A minimal sketch of the consensus idea follows below.

(slide images)
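
A minimal sketch of the majority-like idea described above, under my own simplifications: several models each propose a translation of the same source sentence, and the candidate that agrees most with the others is kept. Plain difflib similarity stands in here for the BLEU-style scoring the team presumably used, and the threshold/iteration refinements from the slide are not modelled:

import difflib

def consensus_pick(candidates):
    # Return the candidate translation that agrees most with the other candidates
    best, best_score = None, -1.0
    for cand in candidates:
        others = [c for c in candidates if c is not cand]
        # Average similarity of this candidate to every other candidate
        score = sum(difflib.SequenceMatcher(None, cand, c).ratio() for c in others) / len(others)
        if score > best_score:
            best, best_score = cand, score
    return best

# Hypothetical outputs of three different models for one source sentence
candidates = [
    "the cat sat on the mat",
    "the cat is sitting on the mat",
    "a cat sat on a mat",
]
print(consensus_pick(candidates))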

Model Results

(slide images)

Summary

(slide images)

Competition Experience Takeaways

(slide images)

Third Place

Overall Pipeline

(slide images)

Data Preprocessing

(slide images)

Data Augmentation

(slide images)

Models Used

(slide images)

Regularization

(slide images)

Mixed Fine-tuning Strategy

(slide images)

Ensemble Learning

(slide images)

Entries marked with * are the ones that gave a comparatively significant improvement.

(slide images)