The following is a code walkthrough of a Chinese-to-Vietnamese deep-learning translation model built from a GRU encoder and a Transformer-style decoder. The GRU encoder turns a Chinese sentence into hidden representations, and the decoder then generates the Vietnamese translation token by token. The approach combines a recurrent neural network (RNN) with self-attention, pairing the efficiency of attention with the GRU's ability to model context, which makes it a reasonable fit for Chinese-Vietnamese translation.
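Before diving into the code, here is a rough sketch of how one batch flows through the two components. The concrete sizes assume the hyperparameters used later in this post (batch size 64, sequence length 50, embedding dimension 256, 512 GRU units); they are illustrative, not mandatory.

# Data flow for one training batch (shapes are illustrative)
# Chinese token ids        -> (64, 50)
# Encoder embedding        -> (64, 50, 256)
# Encoder GRU outputs      -> (64, 50, 512)   attended to by the decoder
# Encoder final GRU state  -> (64, 512)       initial state of the decoder GRU
# Decoder input (one step) -> (64, 1)         previous Vietnamese token
# Cross-attention output   -> (64, 1, 256)
# Decoder GRU output       -> (64, 1, 512)
# Output logits            -> (64, 1, vietnamese_vocab_size)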
First, make sure TensorFlow, Pandas, and scikit-learn are installed for building the model and handling the data; openpyxl is also needed so that Pandas can read xlsx files.
pip install tensorflow pandas scikit-learn openpyxl
Suppose we have an xlsx file (for example translation_dataset.xlsx) containing fields such as id, 越南文翻译 (Vietnamese translation), 中文原句 (Chinese source sentence), 小说名称 (novel title), 小说作者 (author), 分词 (segmentation), 章节id (chapter id), and 预置状态 (preset status). We load the file, extract the Chinese and Vietnamese sentences, and then tokenize and preprocess them.
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
# Load the dataset
data = pd.read_excel('translation_dataset.xlsx')
# Extract the Chinese and Vietnamese texts
vietnamese_texts = data['越南文翻译'].values
chinese_texts = data['中文原句'].values
# Split into training and validation sets
train_vietnamese, val_vietnamese, train_chinese, val_chinese = train_test_split(vietnamese_texts, chinese_texts, test_size=0.2, random_state=42)
For the model to work with the text, each sentence must be converted into a sequence of integers. We use Keras's Tokenizer to build vocabularies for the Chinese and Vietnamese texts and to turn sentences into padded integer sequences. Two details matter here: the decoder needs explicit <start> and <end> markers on the target side, so we wrap the Vietnamese sentences with these tokens before fitting the tokenizer (and keep '<' and '>' out of the tokenizer's filter list); and Tokenizer splits on whitespace, so unsegmented Chinese text should either be pre-segmented (for example via the dataset's 分词 field) or tokenized at the character level.
# Build a tokenizer; the filters exclude '<' and '>' so that <start>/<end> survive
def build_tokenizer(texts, max_vocab_size=10000, oov_token='<OOV>', char_level=False):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=max_vocab_size,
        oov_token=oov_token,
        char_level=char_level,
        filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(texts)
    return tokenizer
# Wrap the target (Vietnamese) sentences with <start>/<end> markers for decoding
train_vietnamese = ['<start> ' + t + ' <end>' for t in train_vietnamese]
val_vietnamese = ['<start> ' + t + ' <end>' for t in val_vietnamese]
# Build the Vietnamese and Chinese tokenizers; the raw Chinese sentences usually
# contain no spaces, so they are tokenized at the character level here
vietnamese_tokenizer = build_tokenizer(train_vietnamese)
chinese_tokenizer = build_tokenizer(train_chinese, char_level=True)
# Convert texts to padded integer sequences
def tokenize_texts(tokenizer, texts, max_len=50):
    sequences = tokenizer.texts_to_sequences(texts)
    sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len, padding='post')
    return sequences
# Convert the training and validation sets to sequences
train_vietnamese_seq = tokenize_texts(vietnamese_tokenizer, train_vietnamese)
val_vietnamese_seq = tokenize_texts(vietnamese_tokenizer, val_vietnamese)
train_chinese_seq = tokenize_texts(chinese_tokenizer, train_chinese)
val_chinese_seq = tokenize_texts(chinese_tokenizer, val_chinese)
# Vocabulary sizes and maximum sequence length
vietnamese_vocab_size = len(vietnamese_tokenizer.word_index) + 1
chinese_vocab_size = len(chinese_tokenizer.word_index) + 1
max_seq_len = 50
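A quick sanity check (the printed values are illustrative) confirms that the padding length and the <start>/<end> markers ended up where the training loop below expects them:

# Sanity check: padded shape and the ids of the special tokens
print(train_vietnamese_seq.shape)                   # e.g. (num_train_pairs, 50)
print(vietnamese_tokenizer.word_index['<start>'],
      vietnamese_tokenizer.word_index['<end>'])     # small integer ids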
A GRU encoder captures the contextual information of the input sentence, and a Transformer-style decoder (multi-head cross-attention followed by a GRU) generates the target-language translation step by step.
from tensorflow.keras.layers import Embedding, GRU, Dense, Input, MultiHeadAttention, LayerNormalization, Dropout
from tensorflow.keras.models import Model

# GRU encoder
class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(enc_units, return_sequences=True, return_state=True)

    def call(self, x):
        x = self.embedding(x)
        enc_output, enc_state = self.gru(x)
        return enc_output, enc_state

# Transformer-style decoder (multi-head cross-attention + GRU)
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, dec_units, num_heads=4, dropout_rate=0.1):
        super(Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.multi_head_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.dropout = Dropout(dropout_rate)
        self.layer_norm1 = LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = LayerNormalization(epsilon=1e-6)
        self.gru = GRU(dec_units, return_sequences=True, return_state=True)
        # Projects the attention branch to dec_units so the second residual
        # connection works even when embedding_dim and dec_units differ
        self.residual_proj = Dense(dec_units)
        self.dense = Dense(vocab_size)

    def call(self, x, enc_output, dec_state):
        x = self.embedding(x)
        # Multi-head cross-attention over the encoder outputs
        attn_output = self.multi_head_attention(query=x, value=enc_output, key=enc_output)
        attn_output = self.dropout(attn_output)
        out1 = self.layer_norm1(x + attn_output)  # residual connection + layer norm
        # Refine the attention output with a GRU
        gru_output, dec_state = self.gru(out1, initial_state=dec_state)
        gru_output = self.layer_norm2(gru_output + self.residual_proj(out1))  # residual connection + layer norm
        # Project to vocabulary logits
        output = self.dense(gru_output)
        return output, dec_state
# Instantiate the translation model
embedding_dim = 256
units = 512
encoder = Encoder(chinese_vocab_size, embedding_dim, units)
decoder = Decoder(vietnamese_vocab_size, embedding_dim, units)
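Before wiring up the training loop, a quick shape check on random token ids can confirm that the encoder and decoder fit together. This is an optional sanity check, not part of the pipeline; the batch size of 2 is arbitrary.

# Optional smoke test: run random ids through the encoder and decoder and inspect shapes
dummy_zh = tf.random.uniform((2, max_seq_len), maxval=chinese_vocab_size, dtype=tf.int32)
enc_out, enc_state = encoder(dummy_zh)                      # (2, 50, 512), (2, 512)
dummy_vi = tf.random.uniform((2, 1), maxval=vietnamese_vocab_size, dtype=tf.int32)
logits, dec_state = decoder(dummy_vi, enc_out, enc_state)   # (2, 1, vietnamese_vocab_size)
print(enc_out.shape, logits.shape)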
# Optimizer and loss
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

# Masked loss: padded positions (token id 0) do not contribute
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)
# One training step with teacher forcing
@tf.function
def train_step(inp, targ):
    loss = 0
    with tf.GradientTape() as tape:
        # Encode the source sentence
        enc_output, enc_hidden = encoder(inp)
        # The decoder starts from the <start> token, with the encoder state as its initial state
        dec_input = tf.expand_dims([vietnamese_tokenizer.word_index['<start>']] * inp.shape[0], 1)
        dec_hidden = enc_hidden
        # Generate one token at a time, feeding the ground-truth token back in (teacher forcing)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden = decoder(dec_input, enc_output, dec_hidden)
            loss += loss_function(targ[:, t], predictions[:, -1, :])
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = loss / int(targ.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
Create the training dataset with tf.data and start training.
BUFFER_SIZE = len(train_chinese_seq)
batch_size = 64
steps_per_epoch = BUFFER_SIZE // batch_size

train_dataset = tf.data.Dataset.from_tensor_slices((train_chinese_seq, train_vietnamese_seq)).shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(batch_size, drop_remainder=True)

# Train the model
EPOCHS = 20
for epoch in range(EPOCHS):
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ)
        total_loss += batch_loss
    print(f'Epoch {epoch+1} Loss {total_loss / steps_per_epoch:.4f}')
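Because the encoder and decoder are trained with a custom loop rather than model.fit, it can be worth persisting their weights once training finishes. Here is a minimal sketch using tf.train.Checkpoint; the checkpoint directory name is an arbitrary choice, not part of the original pipeline.

# Save the encoder, decoder, and optimizer state so training can resume or the model can be reused
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
checkpoint_path = checkpoint.save('./checkpoints/zh_vi_translator')
print('checkpoint written to:', checkpoint_path)
# To restore later, rebuild the same objects and call checkpoint.restore(checkpoint_path)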
After training, we can test the model's translations with a simple greedy decoder.
# Greedy decoding for a single Chinese sentence
def translate(sentence):
    sentence_seq = tokenize_texts(chinese_tokenizer, [sentence], max_len=max_seq_len)
    enc_output, enc_hidden = encoder(sentence_seq)
    dec_input = tf.expand_dims([vietnamese_tokenizer.word_index['<start>']], 0)
    dec_hidden = enc_hidden
    result = []
    for t in range(max_seq_len):
        predictions, dec_hidden = decoder(dec_input, enc_output, dec_hidden)
        # Take the logits of the current step and pick the most likely token
        predicted_id = int(tf.argmax(predictions[0, -1]).numpy())
        word = vietnamese_tokenizer.index_word.get(predicted_id, '<unk>')
        if word == '<end>':
            break
        result.append(word)
        dec_input = tf.expand_dims([predicted_id], 0)
    return ' '.join(result)

# Try a translation
test_sentence = "这是测试句子。"  # a Chinese input sentence
print("Translation:", translate(test_sentence))
With the steps above you can build a Chinese-Vietnamese translation model based on a GRU encoder and a Transformer-style decoder, which can serve as a starting point for multilingual translation tasks and NLP research.
Datasets of this kind contain paired (or multi-way) text samples, where each pair expresses the same content in different languages. Their purpose is to train machine translation models to translate text from one language into another accurately, supporting the development and tuning of automatic translation systems and improving the efficiency and accuracy of cross-language communication.
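For concreteness, two rows of such a parallel corpus might look like the sketch below. The field names come from the dataset description above; the sentence pairs themselves are invented purely for illustration.

# Hypothetical rows of translation_dataset.xlsx (sentence pairs invented for illustration)
#   id | 中文原句   | 越南文翻译
#    1 | 你好。     | Xin chào.
#    2 | 谢谢你。   | Cảm ơn bạn.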