import os
import json
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
# Configuration
class Config:
    data_dir = "./data"
    max_length = 128
    batch_size = 32
    epochs = 10
    lr = 2e-5
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    label_dict = {"和差问题应用题": 0, "其他类型": 1}  # extend to match the label set in your data
    pretrained_model = "bert-base-chinese"
# Dataset handling
class MathProblemDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.texts = [item["content"] for item in data]
        self.labels = [Config.label_dict[item["type"]] for item in data]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize one problem text, padding/truncating to a fixed length
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=Config.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx])
        }
# Model architecture
class BertClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained(Config.pretrained_model)
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(self.bert.config.hidden_size, 256),
            nn.ReLU(),
            nn.Linear(256, len(Config.label_dict))
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the [CLS] token representation as the sentence embedding
        pooled_output = outputs.last_hidden_state[:, 0, :]
        return self.classifier(pooled_output)
# Training pipeline
def main():
    # Load data: one JSON file per problem
    all_files = [os.path.join(Config.data_dir, f)
                 for f in os.listdir(Config.data_dir) if f.endswith(".json")]
    data = []
    for f in all_files[:300]:  # example: load only the first 300 files
        with open(f, encoding="utf-8") as file:
            data.append(json.load(file))

    # Train/test split
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

    # Initialize components
    tokenizer = BertTokenizer.from_pretrained(Config.pretrained_model)
    train_dataset = MathProblemDataset(train_data, tokenizer)
    test_dataset = MathProblemDataset(test_data, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=Config.batch_size)
    model = BertClassifier().to(Config.device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=Config.lr)

    # Training loop
    for epoch in range(Config.epochs):
        model.train()
        for batch in train_loader:
            inputs = {k: v.to(Config.device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(Config.device)
            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Evaluate on the held-out split after each epoch
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in test_loader:
                inputs = {k: v.to(Config.device) for k, v in batch.items() if k != "label"}
                labels = batch["label"].to(Config.device)
                outputs = model(**inputs)
                preds = torch.argmax(outputs, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        acc = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds, average="macro")
        print(f"Epoch {epoch+1}: Accuracy={acc:.4f}, F1={f1:.4f}")

if __name__ == "__main__":
    main()
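For completeness, here is a minimal inference sketch, not part of the original script: predict is a hypothetical helper, and it assumes a BertClassifier whose weights have already been trained, plus the tokenizer built in main().

# Hypothetical helper for classifying a single new problem (assumed usage, not in the original)
def predict(model, tokenizer, text):
    id2label = {v: k for k, v in Config.label_dict.items()}
    model.eval()
    encoding = tokenizer(text, max_length=Config.max_length, padding="max_length",
                         truncation=True, return_tensors="pt").to(Config.device)
    with torch.no_grad():
        logits = model(input_ids=encoding["input_ids"],
                       attention_mask=encoding["attention_mask"])
    # Map the highest-scoring class index back to its label string
    return id2label[int(torch.argmax(logits, dim=1))]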
Model design and functionality notes:

Data pipeline: raw JSON --> text extraction --> label encoding --> BERT tokenization --> DataLoader construction
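To make the pipeline concrete, a record like the one below would pass through these stages. The field names ("content", "type") come from the dataset code above; the example values are purely illustrative, and real files may carry extra fields such as knowledge_point.

# Hypothetical input record (schema inferred from the fields the code reads)
{
    "content": "两数之和是30,两数之差是6,求这两个数。",
    "type": "和差问题应用题"
}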
Possible improvements (the first two are sketched below):
- Add early stopping (patience=3)
- Add a learning-rate scheduler (LinearLR)
- Integrate knowledge features (the knowledge_point field)
- Try hierarchical label learning
- Add confusion-matrix analysis
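A sketch of the first two ideas under stated assumptions: evaluate is a hypothetical callback that returns the macro-F1 from the evaluation loop in main(), and the integration shown is one common pattern rather than the original author's implementation.

# Sketch: LinearLR schedule plus early stopping (patience=3); `evaluate` is assumed
from torch.optim.lr_scheduler import LinearLR

def train_with_early_stopping(model, optimizer, criterion, train_loader,
                              test_loader, evaluate, patience=3):
    scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.1,
                         total_iters=Config.epochs)
    best_f1, bad_epochs = 0.0, 0
    for epoch in range(Config.epochs):
        model.train()
        for batch in train_loader:
            inputs = {k: v.to(Config.device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(Config.device)
            optimizer.zero_grad()
            loss = criterion(model(**inputs), labels)
            loss.backward()
            optimizer.step()
        scheduler.step()  # linearly decay the learning rate once per epoch
        f1 = evaluate(model, test_loader)  # macro-F1 on the held-out split
        if f1 > best_f1:
            best_f1, bad_epochs = f1, 0
            torch.save(model.state_dict(), "best_model.pt")  # keep the best checkpoint
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                break  # stop after `patience` epochs without improvement
    return best_f1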
This approach stays general-purpose while being tuned to the characteristics of math word-problem text, so it can handle key features of Chinese math problems such as numeric relations and logical structure. Domain-adaptive training can further improve performance in specific scenarios.