Learning_O Code Walkthrough

Main function

Create the argument parser and add the command-line arguments.

parser = argparse.ArgumentParser()
parser.add_argument(...)
args = parser.parse_args()
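
The specific flags are elided above; a minimal sketch covering only the arguments referenced later in this walkthrough (the flag names come from later uses such as args.per_types and args.nb_tasks, but types and defaults are assumptions):

import argparse

parser = argparse.ArgumentParser()
# Only the arguments referenced elsewhere in this walkthrough; details assumed.
parser.add_argument("--labels", type=str, help="path to the full label list")
parser.add_argument("--per_types", type=int, help="entity types added per task")
parser.add_argument("--nb_tasks", type=int, help="total number of tasks")
parser.add_argument("--start_step", type=int, default=0, help="task index to resume from")
parser.add_argument("--output_dir", type=str, help="directory for per-task checkpoints")
args = parser.parse_args()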

Set the random seed and the number of entity types each task contains.

set_seed(args)
per_types = args.per_types
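
set_seed is not shown in the source; a typical implementation (assumed here) seeds Python, NumPy, and PyTorch:

import random
import numpy as np
import torch

def set_seed(args):
    # Seed every RNG involved in training; args.seed is assumed to exist.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)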

Continual learning

Iterate over the tasks.

for step_id in range(args.start_step, args.nb_tasks):

Get the current task's label set, the number of labels, and the PAD label id.

labels = get_labels_dy(args.labels, per_types, step_id=step_id)
num_labels = len(labels)
pad_token_label_id = CrossEntropyLoss().ignore_index
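
get_labels_dy is only called here; a plausible sketch (an assumption) consistent with the rest of the walkthrough, where index 0 is "O" and each task contributes per_types entity types:

def get_labels_dy(path, per_types, step_id):
    # Read the full, ordered type list; one type per line (format assumed).
    with open(path, encoding="utf-8") as f:
        all_types = [line.strip() for line in f if line.strip()]
    # "O" sits at index 0; tasks 0..step_id contribute per_types types each.
    return ["O"] + all_types[: (step_id + 1) * per_types]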

Set model_name_or_path

# For the first task, load bert-base-uncased
if step_id == 0:
    model_name_or_path = "bert-base-uncased"
else:  # otherwise load the previous task's model
    model_name_or_path = os.path.join(args.output_dir, "task_" + str(step_id - 1))

Train and evaluate the model

train_and_eval(args, labels, num_labels, pad_token_label_id, model_name_or_path, output_dir, data_dir, step_id)

Preparing the dataset

The pipeline chains the three functions below; a sketch of read_examples_from_file follows the list.

read_examples_from_file()

convert_examples_to_features()

load_and_cache_examples()
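
read_examples_from_file is listed above but not shown; a minimal sketch assuming CoNLL-style "token label" lines with blank lines separating sentences (the file naming and returned structure are assumptions):

import os

def read_examples_from_file(data_dir, mode):
    # One "token label" pair per line; blank lines separate sentences (assumed format).
    path = os.path.join(data_dir, "{}.txt".format(mode))
    examples, words, tags = [], [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # sentence boundary
                if words:
                    examples.append({"words": words, "labels": tags})
                    words, tags = [], []
            else:
                parts = line.split()
                words.append(parts[0])
                tags.append(parts[-1])
    if words:  # flush the last sentence
        examples.append({"words": words, "labels": tags})
    return examples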

Load example features from file

If cached_features_file exists, load the features directly from it.

features = torch.load(cached_features_file)

Otherwise, process the raw data first and then build the features.

examples = read_examples_from_file(data_dir, mode)
features = convert_examples_to_features(...)

Extract the feature attributes and build the dataset.

all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

Return the dataset.

return dataset

Utility functions

get_exemplar_means()

Computes each class's prototype, i.e., its mean vector.

Group the samples by label:

# Build a dict mapping every possible class index to an empty list
cls_exemplar = {cls: [] for cls in range(n_tags)}
for x, y in zip(support_reps, support_labels):
    # Store each sample representation under its label
    cls_exemplar[y.item()].append(x)

Compute the prototype for each class:

for cls, exemplar in cls_exemplar.items():
    features = []
    for feature in exemplar:
        feature.data = feature.data / feature.data.norm()  # normalize
        features.append(feature)
    # If the class has no samples, randomly initialize a tensor with the same
    # size as a sample representation to serve as its prototype
    if len(features) == 0:
        mu_y = torch.normal(0, 1, size=tuple(x.size())).to(args.device)
        mu_y = mu_y.squeeze()
    # Otherwise the prototype is the mean of all the class's representations
    else:
        features = torch.stack(features)
        mu_y = features.mean(0).squeeze()
    mu_y.data = mu_y.data / mu_y.data.norm()  # normalize
    exemplar_means[cls] = mu_y

Return the prototypes of all classes:

return exemplar_means

get_support_encodings_and_labels(_total)()

Gets the support set's encodings and labels.

Collects the encodings and labels from train_loader, support_loader, and support_o_loader.

Get the encodings and labels of each batch in train_loader (get_support_encodings_and_labels_total only):

train_iterator = tqdm(train_loader, desc="Support data representations")
for index, batch in enumerate(train_iterator):
    encodings, labels = get_token_encodings_and_labels(args, model, batch)
    encodings = encodings.view(-1, encodings.shape[-1])
    labels = labels.flatten()
    # Filter out positions whose label is the padding label
    idx = torch.where((labels - pad_token_label_id) != 0)[0]
    support_encodings.append(encodings[idx])
    support_labels.append(labels[idx])

The same procedure collects each batch's encodings and labels from support_loader and support_o_loader:

support_iterator = tqdm(support_loader, desc="Support data representations")
for index, batch in enumerate(support_iterator):
    ...
support_o_iterator = tqdm(support_o_loader, desc="Support data representations")
for _, batch in enumerate(support_o_iterator):
    ...

Return all the encodings and labels:

return torch.cat(support_encodings), torch.cat(support_labels)

get_token_logits_and_labels()

Use the existing pre-trained BERT-NER model to obtain the prediction scores and output labels.

with torch.no_grad():
    inputs = {"input_ids": batch[0], "attention_mask": batch[1],
              "output_hidden_states": True, "mode": "dev"}
    if model.config.model_type != "distilbert":
        inputs["token_type_ids"] = (batch[2] if model.config.model_type in ["bert", "xlnet"]
                                    else None)  # XLM and RoBERTa don't use token_type_ids
    outputs = model(**inputs)
    logits = outputs[-1]
return logits, label_batch

get_rehearsal_prototype()

Load the support sets and obtain their encodings and labels.

support_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="memory", data_dir=data_dir)
support_o_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="memory_o", data_dir=data_dir)
support_sampler = SequentialSampler(support_dataset)
support_o_sampler = SequentialSampler(support_o_dataset)  # missing in the original snippet
support_dataloader = DataLoader(support_dataset, sampler=support_sampler, batch_size=args.eval_batch_size)
support_o_dataloader = DataLoader(support_o_dataset, sampler=support_o_sampler, batch_size=args.eval_batch_size)
support_encodings, support_labels = get_support_features_and_labels(args, model, support_dataloader, support_o_dataloader, pad_token_label_id)
# Normalize support_encodings
support_encodings = F.normalize(support_encodings)

Compute the class similarities

for i in range(1, len(labels)):  # iterate over every non-"O" class
    # Cosine similarity between all pairs of class-i samples
    support_reps_dists = torch.matmul(support_encodings[support_labels == i],
                                      support_encodings[support_labels == i].T)
    # Zero the diagonal (each sample's similarity with itself) so that a sample
    # is not treated as its own prototype
    support_reps_dists = torch.scatter(support_reps_dists, 1,
                                       torch.arange(support_reps_dists.shape[0]).view(-1, 1).to(args.device), 0.)
    # Class-i similarity: mean of the remaining positive entries
    prototype_dists.append(support_reps_dists[support_reps_dists > 0].view(-1).mean(-1))

Return the list of class similarities:

return prototype_dists

Model definition

MySftBertModel()

Initialization: __init__

Accepts the BERT config along with additional custom arguments.

self.per_types = per_types  # number of entity types per task
self.feat_dim = feat_dim  # feature dimension
self.hidden_size = config.hidden_size  # hidden-state size
self.num_labels = config.num_labels  # number of labels
self.bert = BertModel(config, add_pooling_layer=False)

Sets up the classifier and the projection head; the classifier's output layer is chosen according to mode.

classifier_dropout = (  # classifier dropout probability
    config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
if mode == "train":  # the linear classifier's output dimension depends on the mode
    if self.num_labels - 1 > self.per_types:  # "O" samples have been relabeled
        self.classifier = nn.Linear(config.hidden_size, config.num_labels - self.per_types)
    else:
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
else:
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
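
A concrete example (numbers chosen for illustration): with per_types = 4, the second task has num_labels = 1 + 2·4 = 9; since num_labels − 1 = 8 > 4, the training-time classifier gets 9 − 4 = 5 outputs, while in dev/test mode it keeps the full 9-way output.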

Set up the head (a linear layer or an MLP):

if head == 'linear':
    self.head = nn.Linear(self.hidden_size, self.hidden_size)
elif head == 'mlp':
    self.head = nn.Sequential(
        nn.Linear(self.hidden_size, self.hidden_size),
        nn.ReLU(inplace=True),
        nn.Linear(self.hidden_size, self.feat_dim)
    )
else:
    raise NotImplementedError('head not supported: {}'.format(head))

Forward pass: forward

Extract features

# Run the underlying BERT model to get the initial token representations
outputs = self.bert(...)
features_enc = outputs[0]
# Project through self.head and normalize to obtain the contrastive features
features = F.normalize(self.head(features_enc.view(-1, self.hidden_size)), dim=1)
# Use the initial representations for classification
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)

If not in training mode, return features_enc, features, and logits immediately:

loss = None
if mode != "train":
    return loss, features_enc, features, logits

Compute the loss

For the first task:

if self.num_labels - 1 == self.per_types:
    if loss_name == "supcon": loss = supcon_loss
    elif loss_name == "supcon_o": loss = supcon_o_loss
    elif loss_name == "supcon_o_ce": loss = supcon_o_loss + ce_loss
    elif loss_name == "supcon_o_bce": loss = supcon_o_loss + bce_loss
    elif loss_name == "ce": loss = ce_loss
    elif loss_name == "bce_o": loss = bce_loss
    elif loss_name == "supcon_ce": loss = supcon_loss + ce_loss
    elif loss_name == "supcon_bce": loss = supcon_loss + bce_loss

For subsequent tasks:

elif self.num_labels > self.per_types:
    # Gather the new-class labels labels_new, the new-class logits student_new,
    # the old-class student logits s_logits, and the teacher logits old_logits
    labels_new, student_new, s_logits, old_logits = gather_rh_ce(labels, t_logits,
                                                                 logits, self.num_labels - self.per_types)
    if loss_name == "supcon": loss = supcon_loss + kd_loss
    elif loss_name == "supcon_nokd": loss = supcon_loss
    elif loss_name == "supcon_o": loss = supcon_o_loss + kd_loss
    elif loss_name == "supcon_o_ce": loss = supcon_o_loss + ce_loss + kd_loss
    elif loss_name == "supcon_o_bce": loss = supcon_o_loss + bce_loss
    elif loss_name == "ce": loss = ce_loss + kd_loss
    elif loss_name == "bce_o": loss = bce_loss
    elif loss_name == "supcon_ce": loss = supcon_loss + ce_loss + kd_loss
    elif loss_name == "supcon_bce": loss = supcon_loss + bce_loss

Return loss, features_enc, features, logits:

return loss, features_enc, features, logits

Training and evaluation

train_and_eval()

Load the previous round's pretrained config, model, and tokenizer.

# Resolve the config, model, and tokenizer classes
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
# For the first task, this loads bert-base-uncased directly
config = config_class.from_pretrained(args.config_name if args.config_name else model_name_or_path, num_labels=num_labels)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else model_name_or_path, do_lower_case=args.do_lower_case)
model = model_class.from_pretrained(model_name_or_path, from_tf=bool(".ckpt" in model_name_or_path), config=config)

Load the training set

# train_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="rehearsal", data_dir=data_dir)
# Sequential sampling
train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
# Build the training data loader
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

Get the old (teacher) model's logits

teacher_evaluate()

# If this is not the first task, evaluate the teacher model
if step_id > 0:
    t_logits, out_new_labels = teacher_evaluate(args, train_dataloader, model, tokenizer, labels,
                                                pad_token_label_id, mode="train", data_dir=data_dir)
    model.new_classifier()  # create a new classifier
else:
    t_logits = None
    out_new_labels = None

Train the model

train()

global_step, tr_loss = train(args, train_dataset, train_dataloader, model, tokenizer, labels,
                             pad_token_label_id, data_dir=data_dir, output_dir=output_dir,
                             t_logits=t_logits, out_new_labels=out_new_labels)
# Save the trained model weights, config, and tokenizer
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
torch.save(args, os.path.join(output_dir, "training_args.bin"))

Evaluate the model on the dev set

evaluate()

# Load and evaluate the model from each checkpoint
for checkpoint in checkpoints:
    model = model_class.from_pretrained(checkpoint, mode="dev")
    train_dataloader = None
    _, result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev",
                            data_dir=data_dir, prefix=global_step)

Predict on the test set

evaluate()

# Load the model and tokenizer
tokenizer = tokenizer_class.from_pretrained(output_dir, do_lower_case=args.do_lower_case)
model = model_class.from_pretrained(output_dir, mode="test")
# Run evaluate on the test set to obtain macro-F1, micro-F1, and the predicted labels
macro_results, micro_results, predictions = evaluate(args, model, tokenizer, labels,
                                                     pad_token_label_id, mode="test", data_dir=data_dir)

teacher_evaluate()

Select the data loader according to the mode

If the mode is "train", use the training set:

if mode == "train":
    eval_dataloader = train_dataloader

If the mode is "dev", use the dev set:

elif mode == "dev":
    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode, data_dir=data_dir)
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

Evaluate the model

Put the model in evaluation mode:

model.eval()

Use get_token_logits_and_labels to obtain each batch's prediction scores logits and output labels out_labels:

for batch in tqdm(eval_dataloader, desc="Evaluating"):
    logits, out_labels = get_token_logits_and_labels(args, model, batch)
    # Count evaluation steps to track how many batches have been processed
    nb_eval_steps += 1
    # Collect each batch's logits
    logits_list.append(logits.detach().cpu())

Relabel old entity classes with the prototype relabeling thresholds

Compute the prototype relabeling thresholds and, for each token, the entity class whose prototype is most similar.

evaluate()

# preds: per-token index of the class whose prototype is most similar
# emissions: per-token maximum prototype similarity over all classes
# out_label_ids: label indices predicted by the original model
# prototype_dists: per-old-class relabeling thresholds (not yet scaled by βi)
preds, emissions, out_label_ids, prototype_dists = evaluate(args, model, tokenizer, labels, pad_token_label_id,
                                                            mode="rehearsal", data_dir=data_dir)
# Scale the relabeling thresholds (the hyperparameter βi depends on the task step i)
for i in range(current_task_id):
    if args.change_th:
        task_para = th_para - (current_task_id - i - 1) * th_reduction  # βi
    else:
        task_para = th_para
    prototype_dists[i * args.per_types + 1:(i + 1) * args.per_types + 1] *= task_para
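
A quick numeric illustration (values invented for the example): with current_task_id = 3, th_para = 0.8, th_reduction = 0.1, and change_th enabled, task 0 gets β0 = 0.8 − 2·0.1 = 0.6, task 1 gets β1 = 0.7, and task 2 gets β2 = 0.8, so older tasks are relabeled under looser thresholds. Each βi scales the per_types threshold entries of task i (indices i·per_types + 1 through (i+1)·per_types, skipping index 0, which is "O").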

Relabel the old entity classes

for i in range(out_label_ids.shape[0]):  # iterate over examples
    for j in range(out_label_ids.shape[1]):  # iterate over tokens
        idx = preds[i][j]  # class index predicted via prototype similarity
        # If the prototype similarity exceeds the relabeling threshold and the
        # original prediction falls among the old-class labels,
        if emissions[i][j] > prototype_dists[idx].item() and out_label_ids[i][j] < len(labels) - args.per_types:
            out_label_new_list[i].append(preds[i][j])  # relabel this "O" as the old entity class
        else:  # otherwise keep the original label
            out_label_new_list[i].append(out_label_ids[i][j])

Return logits_list, out_label_new_list:

return logits_list, out_label_new_list

evaluate()

Load the datasets

Loads eval_dataset, support_dataset, support_o_dataset, and train_dataset.

eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode, data_dir=data_dir)
support_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="memory", data_dir=data_dir)
support_o_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="memory_o", data_dir=data_dir)
train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train", data_dir=data_dir)
# Sequential sampling
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
support_sampler = SequentialSampler(support_dataset) if args.local_rank == -1 else DistributedSampler(support_dataset)
support_o_sampler = SequentialSampler(support_o_dataset) if args.local_rank == -1 else DistributedSampler(support_o_dataset)
train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
# Data loaders
support_dataloader = DataLoader(support_dataset, sampler=support_sampler, batch_size=args.eval_batch_size)
support_o_dataloader = DataLoader(support_o_dataset, sampler=support_o_sampler, batch_size=args.eval_batch_size)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.eval_batch_size)

Get the support datasets' encodings and labels:

support_encodings, support_labels = get_support_encodings_and_labels_total(args, model, support_dataloader, support_o_dataloader, train_dataloader, pad_token_label_id)

Three strategies for relabeling "O" tokens that come from old classes

Prototype-based relabeling

Based on the distance between "O" samples and the class prototypes.

Compute each class's prototype:

exemplar_means = get_exemplar_means(args, support_encodings, support_labels)

Compute the prototype relabeling thresholds and each "O" token's highest prototype similarity.

Use NNClassification() to compute nn_preds, nn_emissions, and prototype_dists:

for _, batch in enumerate(eval_iterator):
    batch = tuple(t.to(args.device) for t in batch)
    # Use the existing model to get each batch's encodings and labels
    encodings, encoding_labels = get_token_encodings_and_labels(args, model, batch)
    # In rehearsal mode, drop the current task's entries from support_encodings
    # and support_labels before predicting
    if mode == "rehearsal":
        cls = NNClassification()
        support_encodings = support_encodings[support_labels < len(labels) - args.per_types]
        support_labels = support_labels[support_labels < len(labels) - args.per_types]
    """
    nn_preds (batch_size, sent_len): per-token index of the class whose prototype is most similar
    nn_emissions (batch_size, sent_len, ndim): per-token maximum prototype similarity over all classes
    prototype_dists: per-old-class relabeling thresholds (not yet scaled by βi)
    """
    nn_preds, nn_emissions, prototype_dists = cls.nn_classifier_dot_prototype(encodings, support_encodings, support_labels, exemplar_means)

Nearest-neighbor relabeling

Based on the distance between "O" samples and each class's exemplars.

if args.cls_name == "ncm_dot":
    cls = NcmClassification()
    nn_preds = cls.ncm_classifier_dot(encodings, support_encodings, support_labels, exemplar_means)
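
ncm_classifier_dot is not shown in the source; a minimal nearest-class-mean sketch (an assumption), scoring tokens against the class means by dot product and suppressing the true "O" column the way nn_classifier_dot_prototype does:

import torch
import torch.nn.functional as F

def ncm_classifier_dot(reps, support_reps, support_tags, exemplar_means):
    # support_reps/support_tags are kept for interface parity; this sketch
    # only uses the precomputed class means.
    feature = F.normalize(reps.view(-1, reps.shape[-1]), dim=1)  # (N, ndim)
    means = torch.stack([exemplar_means[c] for c in range(len(exemplar_means))])
    dists = feature @ means.T        # (N, n_classes) dot-product similarities
    dists[:, 0] = 0.                 # suppress the true "O" class
    return dists.argmax(dim=1)       # nearest class mean per token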

Relabeling with the original model

Serves as a reference labeling for the two methods above.

elif args.cls_name == "linear":
    nn_preds, encoding_labels = get_token_logits_and_labels(args, model, batch)

Save the predictions

Append each batch's predictions to preds and store the reference prediction labels in out_label_ids; in rehearsal mode, also keep emissions.

if preds is None:  # first batch
    preds = nn_preds.detach().cpu().numpy()
    out_label_ids = encoding_labels.detach().cpu().numpy()
    if mode == "rehearsal":
        emissions = nn_emissions.detach().cpu().numpy()
else:
    preds = np.append(preds, nn_preds.detach().cpu().numpy(), axis=0)
    out_label_ids = np.append(out_label_ids, encoding_labels.detach().cpu().numpy(), axis=0)
    if mode == "rehearsal":
        emissions = np.append(emissions, nn_emissions.detach().cpu().numpy(), axis=0)

Return the predictions

In rehearsal mode, the function directly returns preds, emissions, out_label_ids, and prototype_dists:

if mode == "rehearsal":
    return preds, emissions, out_label_ids, prototype_dists

If the linear classifier is used, take the argmax over the logits in preds to obtain the predicted classes:

if args.cls_name == "linear":
    preds = np.argmax(preds, axis=2)

out_label_list and preds_list hold the label-string sequences predicted by the original model and by the custom method, respectively:

# Build label_map, mapping each label index to its string name
label_map = {i: "I-" + label for i, label in enumerate(labels)}
label_map[0] = "O"
for i in range(out_label_ids.shape[0]):
    for j in range(out_label_ids.shape[1]):
        if out_label_ids[i, j] != pad_token_label_id:
            out_label_list[i].append(label_map[out_label_ids[i][j]])
            preds_list[i].append(label_map[preds[i][j]])

Output the evaluation metrics:

# Compute F1 scores with the seqeval library
metric = load_metric("seqeval")
metric.add_batch(predictions=preds_list, references=out_label_list)
macro_results, micro_results, _ = compute_metrics(metric)

Return the metrics and the predicted label sequences:

return macro_results, micro_results, preds_list

train()

Compute the total number of training steps t_total and the number of epochs num_train_epochs:

# If max_steps is set, t_total = args.max_steps and num_train_epochs is derived
# from it; otherwise t_total is derived from num_train_epochs
if args.max_steps > 0:
    t_total = args.max_steps
    args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

Configure the optimizer

Uses AdamW with weight decay and a linear learning-rate scheduler with warmup.

no_decay = ["bias", "LayerNorm.weight"]  # parameters exempt from weight decay
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     "weight_decay": args.weight_decay},
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

Training

Iterate over the epochs:

for epoch in train_iterator:
    if epoch >= args.start_train_o_epoch:

Get each class's class similarity:

prototype_dists = get_rehearsal_prototype(args, model, tokenizer, labels,
pad_token_label_id, mode="rehearsal",
data_dir=data_dir)

Get the samples' logits and labels:

# Iterate over the training batches
for step, batch in enumerate(epoch_iterator):
    model.train()  # switch the model to training mode
    if num_labels - 1 > args.per_types:  # not the first task
        t_logits_step = t_logits[step]
        new_labels = out_new_labels[step * args.train_batch_size:step * args.train_batch_size + len(batch[3])]
    else:  # first task: use the training set's original labels
        t_logits_step = None
        new_labels = batch[3]
    if epoch >= args.start_train_o_epoch:
        loss_name = args.loss_name2  # joint loss over entities and "O"
        cls = NNClassification()
        encodings, encoding_labels = get_token_features_and_labels(args, model, batch)

Compute the cosine-similarity scores between samples:

        # top_emissions_step (batch_size*sent_len, batch_size*sent_len): cosine-similarity
        # scores between samples that exceed the entity threshold
        # The median of the class similarities serves as the entity threshold th_dists
        top_emissions_step, _ = cls.get_top_emissions_with_th(encodings, encoding_labels,
                                                              th_dists=torch.median(prototype_dists).item())
    else:
        top_emissions_step = top_emissions

Train with the custom model and obtain the loss:

inputs = {"input_ids": batch[0],
          "attention_mask": batch[1],
          "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
          # XLM and RoBERTa don't use segment_ids
          "labels": new_labels,
          "t_logits": t_logits_step,
          "mode": "train",
          "loss_name": loss_name,
          "top_emissions": top_emissions_step,
          "topk_th": True
          }
outputs = model(**inputs)
loss = outputs[0]
# With gradient accumulation, divide the loss by the number of accumulation steps
if args.gradient_accumulation_steps > 1:
    loss = loss / args.gradient_accumulation_steps

Update the parameters:

loss.backward()  # backward pass
tr_loss += loss.item()
# Once the accumulation step count is reached
if (step + 1) % args.gradient_accumulation_steps == 0:
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()  # clear all parameter gradients
    global_step += 1  # advance the global step

Evaluation

Evaluate model performance on the dev set:

_, results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev",
data_dir=data_dir)

Save the model and training arguments:

model_to_save.save_pretrained(output_dir)
torch.save(args, os.path.join(output_dir, "training_args.bin"))

Stop training once the maximum number of steps is reached:

if args.max_steps > 0 and global_step > args.max_steps:
    epoch_iterator.close()  # close the current epoch iterator
    break

Return the global step count and the average loss:

return global_step, tr_loss / global_step  

Classifiers

NNClassification()

nn_classifier_dot_prototype()

Relabels "O" tokens that come from old entity classes, based on the prototypes.

Compute each "O" token's maximum similarity to the class prototypes:

# Reshape the input representations reps into a 2-D tensor and normalize
feature = reps.view(-1, reps.shape[-1])  # (batch_size, ndim)
for j in range(feature.size(0)):  # normalize
    feature.data[j] = feature.data[j] / feature.data[j].norm()
means = torch.stack([exemplar_means[cls] for cls in range(n_tags)])  # (n_classes, ndim)
dists = torch.matmul(feature, means.T)  # (batch_size, n_classes)
dists[:, 0] = torch.zeros(1).to(reps.device)  # zero out column 0, the true "O" class
# emissions: per-token maximum prototype similarity; tags: the class index attaining it
emissions, tags = dists.max(1)

Compute each class's prototype relabeling threshold:

# Reshape support_reps into a 2-D tensor and normalize
support_reps = support_reps.view(-1, support_reps.shape[-1])
for j in range(support_reps.size(0)):  # normalize
    support_reps.data[j] = support_reps.data[j] / support_reps.data[j].norm()
support_reps = F.normalize(support_reps)
for i in range(n_tags):
    # Similarity between class i's prototype and the class-i support samples
    support_reps_dists = torch.matmul(support_reps[support_tags == i], means[i].T)
    # Take the minimum along the last (feature) dimension: the threshold is the
    # lowest prototype similarity among the class's own support samples
    prototype_dists.append(support_reps_dists.min(-1)[0])

get_top_emissions_with_th()

Compute the cosine similarities between samples:

scores = self._euclidean_metric_dot_2(reps.view(-1, ndim), reps.view(-1, ndim), True)
# Exclude the scores of "O" samples (second dimension)
scores = torch.where(reps_labels == 0, scores.double(), -100.)
# Exclude each sample's score with itself
scores = torch.scatter(scores, 1,
                       torch.arange(scores.shape[0]).view(-1, 1).to(device), -100.)

Keep only the scores above the entity threshold:

top_emissions = scores > th_dists

Return top_emissions, scores:

return top_emissions, scores