NER Learning_O Code Walkthrough (Alive~o.0, 2024-03-18, updated 2024-07-24)
Main function
Create the argument parser and add the command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument(...)
args = parser.parse_args()
Set the random seed and the number of entity types contained in each task
set_seed(args)
per_types = args.per_types
Continual learning
Iterate over the tasks
for step_id in range(args.start_step, args.nb_tasks):
Get the current task's label set, the number of labels, and the PAD label id
labels = get_labels_dy(args.labels, per_types, step_id=step_id)
num_labels = len(labels)
pad_token_label_id = CrossEntropyLoss().ignore_index
Set model_name_or_path
if step_id == 0:
    model_name_or_path = "bert-base-uncased"
else:
    model_name_or_path = os.path.join(args.output_dir, "task_" + str(step_id - 1))
Train and evaluate the model
train_and_eval(args, labels, num_labels, pad_token_label_id, model_name_or_path, output_dir, data_dir, step_id)
Dataset preparation
read_examples_from_file()
convert_examples_to_features()
load_and_cache_examples()
Load example features from file
If cached_features_file exists, load the features directly from it
features = torch.load(cached_features_file)
Otherwise, process the raw data first and then build the features
examples = read_examples_from_file(data_dir, mode)
features = convert_examples_to_features(...)
Extract the attributes of the features and build the dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
Return the dataset
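For reference, a runnable toy of the tensor-construction step above. The two Feature records are fabricated stand-ins for the output of convert_examples_to_features(); -100 matches CrossEntropyLoss().ignore_index used as pad_token_label_id.

import collections
import torch
from torch.utils.data import TensorDataset, DataLoader

# Two fake features standing in for convert_examples_to_features() output.
Feature = collections.namedtuple("Feature", "input_ids input_mask segment_ids label_ids")
features = [
    Feature([101, 7, 102], [1, 1, 1], [0, 0, 0], [-100, 1, -100]),
    Feature([101, 9, 102], [1, 1, 1], [0, 0, 0], [-100, 2, -100]),
]

dataset = TensorDataset(
    torch.tensor([f.input_ids for f in features], dtype=torch.long),
    torch.tensor([f.input_mask for f in features], dtype=torch.long),
    torch.tensor([f.segment_ids for f in features], dtype=torch.long),
    torch.tensor([f.label_ids for f in features], dtype=torch.long),
)
for input_ids, input_mask, segment_ids, label_ids in DataLoader(dataset, batch_size=2):
    print(input_ids.shape)  # torch.Size([2, 3])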
Utility functions
get_exemplar_means()
Compute the prototype of each class, i.e. its mean feature vector
1. Group the support samples by label
cls_exemplar = {cls: [] for cls in range(n_tags)}
for x, y in zip(support_reps, support_labels):
    cls_exemplar[y.item()].append(x)
2. Compute the prototype of each class
for cls, exemplar in cls_exemplar.items():
    features = []
    for feature in exemplar:
        feature.data = feature.data / feature.data.norm()
        features.append(feature)
    if len(features) == 0:
        mu_y = torch.normal(0, 1, size=tuple(x.size())).to(args.device)
        mu_y = mu_y.squeeze()
    else:
        features = torch.stack(features)
        mu_y = features.mean(0).squeeze()
    mu_y.data = mu_y.data / mu_y.data.norm()
    exemplar_means[cls] = mu_y
3. Return the list of per-class prototypes
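A runnable toy mirroring this logic (values fabricated): class 1 has two support vectors, class 2 has none and falls back to a random normalized prototype, as in the code above.

import torch
import torch.nn.functional as F

n_tags, dim = 3, 4
support_reps = torch.tensor([[1., 0., 0., 0.],
                             [0., 1., 0., 0.],
                             [1., 1., 0., 0.]])
support_labels = torch.tensor([1, 1, 0])

# Group the support representations by label.
cls_exemplar = {c: [] for c in range(n_tags)}
for x, y in zip(support_reps, support_labels):
    cls_exemplar[y.item()].append(F.normalize(x, dim=0))

# Prototype = renormalized mean of the normalized class members;
# empty classes get a random unit vector, as in get_exemplar_means.
exemplar_means = {}
for c, exemplar in cls_exemplar.items():
    if len(exemplar) == 0:
        mu = F.normalize(torch.randn(dim), dim=0)
    else:
        mu = F.normalize(torch.stack(exemplar).mean(0), dim=0)
    exemplar_means[c] = mu

print(exemplar_means[1])  # ~[0.7071, 0.7071, 0., 0.]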
get_support_encodings_and_labels(_total)()
Get the encodings and labels of the support set
Collect the encodings and labels from train_loader, support_loader, and support_o_loader
Collect the encodings and labels of each batch in train_loader (get_support_encodings_and_labels_total only)
train_iterator = tqdm(train_loader, desc="Support data representations")
for index, batch in enumerate(train_iterator):
    encodings, labels = get_token_encodings_and_labels(args, model, batch)
    encodings = encodings.view(-1, encodings.shape[-1])
    labels = labels.flatten()
    idx = torch.where((labels - pad_token_label_id) != 0)[0]
    support_encodings.append(encodings[idx])
    support_labels.append(labels[idx])
The same procedure collects the encodings and labels of each batch in support_loader and support_o_loader
support_iterator = tqdm(support_loader, desc="Support data representations")
for index, batch in enumerate(support_iterator):
    ...
support_o_iterator = tqdm(support_o_loader, desc="Support data representations")
for _, batch in enumerate(support_o_iterator):
    ...
Return all the encodings and labels
return torch.cat(support_encodings), torch.cat(support_labels)
get_token_logits_and_labels()
Use the old pre-trained BERT-NER model to obtain prediction scores and output labels
with torch.no_grad():
    inputs = {"input_ids": batch[0], "attention_mask": batch[1], "output_hidden_states": True, "mode": "dev"}
    if model.config.model_type != "distilbert":
        inputs["token_type_ids"] = (batch[2] if model.config.model_type in ["bert", "xlnet"] else None)
    outputs = model(**inputs)
    logits = outputs[-1]
return logits, label_batch
get_rehearsal_prototype()
Load the support sets and their encodings and labels
support_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="memory", data_dir=data_dir)
support_o_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="memory_o", data_dir=data_dir)
support_sampler = SequentialSampler(support_dataset)
support_o_sampler = SequentialSampler(support_o_dataset)
support_dataloader = DataLoader(support_dataset, sampler=support_sampler, batch_size=args.eval_batch_size)
support_o_dataloader = DataLoader(support_o_dataset, sampler=support_o_sampler, batch_size=args.eval_batch_size)
support_encodings, support_labels = get_support_features_and_labels(args, model, support_dataloader, support_o_dataloader, pad_token_label_id)
support_encodings = F.normalize(support_encodings)
Compute the class similarities
for i in range(1, len(labels)):
    support_reps_dists = torch.matmul(support_encodings[support_labels == i], support_encodings[support_labels == i].T)
    support_reps_dists = torch.scatter(support_reps_dists, 1, torch.arange(support_reps_dists.shape[0]).view(-1, 1).to(args.device), 0.)
    prototype_dists.append(support_reps_dists[support_reps_dists > 0].view(-1).mean(-1))
Return the list of class similarities
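A small runnable check of this computation on fabricated vectors: cosine similarities within one class, with the diagonal zeroed via torch.scatter so that self-similarity (always 1.0) does not inflate the mean.

import torch
import torch.nn.functional as F

reps = F.normalize(torch.tensor([[1.0, 0.0], [0.8, 0.6], [0.6, 0.8]]), dim=1)
dists = reps @ reps.T  # (3, 3) pairwise cosine similarities
# Zero out the diagonal (self-similarity) before averaging.
dists = torch.scatter(dists, 1, torch.arange(dists.shape[0]).view(-1, 1), 0.)
class_similarity = dists[dists > 0].view(-1).mean(-1)
print(class_similarity)  # ~0.79, the mean off-diagonal similarity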
Model definition
MySftBertModel()
Initialization (__init__)
Accepts the BERT config along with other custom arguments
self.per_types = per_types
self.feat_dim = feat_dim
self.hidden_size = config.hidden_size
self.num_labels = config.num_labels
self.bert = BertModel(config, add_pooling_layer=False)
Sets up the classifier and the projection head, choosing the classifier's output size according to mode
classifier_dropout = (
    config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
if mode == "train":
    if self.num_labels - 1 > self.per_types:
        self.classifier = nn.Linear(config.hidden_size, config.num_labels - self.per_types)
    else:
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
else:
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
Sets up the head as either a linear layer or an MLP
if head == 'linear':
    self.head = nn.Linear(self.hidden_size, self.hidden_size)
elif head == 'mlp':
    self.head = nn.Sequential(
        nn.Linear(self.hidden_size, self.hidden_size),
        nn.ReLU(inplace=True),
        nn.Linear(self.hidden_size, self.feat_dim)
    )
else:
    raise NotImplementedError('head not supported: {}'.format(head))
Forward pass (forward)
1. Extract features
outputs = self.bert(...)
features_enc = outputs[0]
features = F.normalize(self.head(features_enc.view(-1, self.hidden_size)), dim=1)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
If not in training mode, return features_enc, features, and logits directly
loss = None
if mode != "train":
    return loss, features_enc, features, logits
2. Compute the loss
For the first task
if self.num_labels - 1 == self.per_types:
    if loss_name == "supcon": loss = supcon_loss
    elif loss_name == "supcon_o": loss = supcon_o_loss
    elif loss_name == "supcon_o_ce": loss = supcon_o_loss + ce_loss
    elif loss_name == "supcon_o_bce": loss = supcon_o_loss + bce_loss
    elif loss_name == "ce": loss = ce_loss
    elif loss_name == "bce_o": loss = bce_loss
    elif loss_name == "supcon_ce": loss = supcon_loss + ce_loss
    elif loss_name == "supcon_bce": loss = supcon_loss + bce_loss
For subsequent tasks
elif self.num_labels > self.per_types:
    labels_new, student_new, s_logits, old_logits = gather_rh_ce(
        labels, t_logits, logits, self.num_labels - self.per_types)
    if loss_name == "supcon": loss = supcon_loss + kd_loss
    elif loss_name == "supcon_nokd": loss = supcon_loss
    elif loss_name == "supcon_o": loss = supcon_o_loss + kd_loss
    elif loss_name == "supcon_o_ce": loss = supcon_o_loss + ce_loss + kd_loss
    elif loss_name == "supcon_o_bce": loss = supcon_o_loss + bce_loss
    elif loss_name == "ce": loss = ce_loss + kd_loss
    elif loss_name == "bce_o": loss = bce_loss
    elif loss_name == "supcon_ce": loss = supcon_loss + ce_loss + kd_loss
    elif loss_name == "supcon_bce": loss = supcon_loss + bce_loss
3. Return loss, features_enc, features, logits
return loss, features_enc, features, logits
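The individual terms (supcon_loss, supcon_o_loss, ce_loss, bce_loss, kd_loss) are computed earlier in forward and are not reproduced in this walkthrough. For orientation only, here is a generic temperature-scaled distillation term of the kind kd_loss usually denotes; this is an assumption, not necessarily the exact form used in this repository.

import torch
import torch.nn.functional as F

def kd_loss_sketch(s_logits, t_logits, T=2.0):
    # Assumed form: KL(teacher || student) over the old classes,
    # softened by temperature T and rescaled by T^2.
    log_p_student = F.log_softmax(s_logits / T, dim=-1)
    p_teacher = F.softmax(t_logits / T, dim=-1)
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * (T * T)

teacher_logits = torch.randn(8, 5)  # old model, 5 old classes
student_logits = torch.randn(8, 5)  # new model, restricted to the same 5 classes
print(kd_loss_sketch(student_logits, teacher_logits))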
Training and evaluating the model
train_and_eval()
Load the config, model, and tokenizer from the previous task's checkpoint
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name if args.config_name else model_name_or_path, num_labels=num_labels)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else model_name_or_path, do_lower_case=args.do_lower_case)
model = model_class.from_pretrained(model_name_or_path, from_tf=bool(".ckpt" in model_name_or_path), config=config)
Load the training set
train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="rehearsal", data_dir=data_dir)
train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
Get the old model's features
teacher_evaluate()
if step_id > 0:
    t_logits, out_new_labels = teacher_evaluate(args, train_dataloader, model, tokenizer, labels, pad_token_label_id, mode="train", data_dir=data_dir)
    model.new_classifier()
else:
    t_logits = None
    out_new_labels = None
Train the model
train()
global_step, tr_loss = train(args, train_dataset, train_dataloader, model, tokenizer, labels, pad_token_label_id, data_dir=data_dir, output_dir=output_dir, t_logits=t_logits, out_new_labels=out_new_labels)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
torch.save(args, os.path.join(output_dir, "training_args.bin"))
Evaluate the model on the dev set
evaluate()
for checkpoint in checkpoints:
    model = model_class.from_pretrained(checkpoint, mode="dev")
    train_dataloader = None
    _, result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", data_dir=data_dir, prefix=global_step)
Predict on the test set
evaluate()
tokenizer = tokenizer_class.from_pretrained(output_dir, do_lower_case=args.do_lower_case)
model = model_class.from_pretrained(output_dir, mode="test")
macro_results, micro_results, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test", data_dir=data_dir)
teacher_evaluate()
Set up the dataloader according to the mode
If the mode is "train", use the training set
if mode == "train":
    eval_dataloader = train_dataloader
If the mode is "dev", use the dev set
elif mode == "dev":
    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode, data_dir=data_dir)
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
Evaluate the model
Put the model in evaluation mode
Use get_token_logits_and_labels to obtain each batch's prediction scores (logits) and output labels (out_labels)
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    logits, out_labels = get_token_logits_and_labels(args, model, batch)
    nb_eval_steps += 1
    logits_list.append(logits.detach().cpu())
Relabel old entity classes using the prototype relabeling thresholds
Compute the prototype relabeling thresholds and, for each token, the entity class with the highest prototype similarity
evaluate()
preds, emissions, out_label_ids, prototype_dists = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="rehearsal", data_dir=data_dir)
for i in range(current_task_id):
    if args.change_th:
        task_para = th_para - (current_task_id - i - 1) * th_reduction
    else:
        task_para = th_para
    prototype_dists[i * args.per_types + 1:(i + 1) * args.per_types + 1] *= task_para
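As a worked example (illustrative numbers; the real values come from args): with th_para = 0.9, th_reduction = 0.05, and current_task_id = 3, the thresholds of task 0 are scaled by 0.9 - 2 * 0.05 = 0.8, those of task 1 by 0.85, and those of task 2 by 0.9, so the thresholds of older tasks are relaxed the most.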
Relabel the old entity classes
for i in range(out_label_ids.shape[0]):
    for j in range(out_label_ids.shape[1]):
        idx = preds[i][j]
        if emissions[i][j] > prototype_dists[idx].item() and out_label_ids[i][j] < len(labels) - args.per_types:
            out_label_new_list[i].append(preds[i][j])
        else:
            out_label_new_list[i].append(out_label_ids[i][j])
Return logits_list and out_label_new_list
return logits_list, out_label_new_list
evaluate()
Load the datasets
Load eval_dataset, support_dataset, support_o_dataset, and train_dataset
eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode, data_dir=data_dir)
support_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="memory", data_dir=data_dir)
support_o_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="memory_o", data_dir=data_dir)
train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train", data_dir=data_dir)
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
support_sampler = SequentialSampler(support_dataset) if args.local_rank == -1 else DistributedSampler(support_dataset)
support_o_sampler = SequentialSampler(support_o_dataset) if args.local_rank == -1 else DistributedSampler(support_o_dataset)
train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
support_dataloader = DataLoader(support_dataset, sampler=support_sampler, batch_size=args.eval_batch_size)
support_o_dataloader = DataLoader(support_o_dataset, sampler=support_o_sampler, batch_size=args.eval_batch_size)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.eval_batch_size)
Get the support sets' embeddings and labels
support_encodings, support_labels = get_support_encodings_and_labels_total(args, model, support_dataloader, support_o_dataloader, train_dataloader, pad_token_label_id)
Three strategies for relabeling "O" tokens that belong to old classes
Relabel with prototypes
Based on the distance between "O" samples and the prototypes
Compute the prototype of each class
exemplar_means = get_exemplar_means(args, support_encodings, support_labels)
Compute the prototype relabeling thresholds and each "O" token's highest prototype similarity
Use NNClassification() to compute nn_preds, nn_emissions, and prototype_dists
for _, batch in enumerate(eval_iterator):
    batch = tuple(t.to(args.device) for t in batch)
    encodings, encoding_labels = get_token_encodings_and_labels(args, model, batch)
    if mode == "rehearsal":
        cls = NNClassification()
        support_encodings = support_encodings[support_labels < len(labels) - args.per_types]
        support_labels = support_labels[support_labels < len(labels) - args.per_types]
        # nn_preds (batch_size, sent_len): for each token, the index of the class with the highest prototype similarity
        # nn_emissions (batch_size, sent_len, ndim): for each token, the highest similarity to any class prototype
        # prototype_dists: the prototype relabeling threshold of each old class (before scaling by beta_i)
        nn_preds, nn_emissions, prototype_dists = cls.nn_classifier_dot_prototype(encodings, support_encodings, support_labels, exemplar_means)
Relabel with nearest neighbors
Based on the distance between "O" samples and each class's exemplars
if args.cls_name == "ncm_dot":
    cls = NcmClassification()
    nn_preds = cls.ncm_classifier_dot(encodings, support_encodings, support_labels, exemplar_means)
Relabel with the original model
Serves as the reference labeling for the two methods above
elif args.cls_name == "linear":
    nn_preds, encoding_labels = get_token_logits_and_labels(args, model, batch)
Save the predictions
Append each batch's predictions to preds and save the reference labels to out_label_ids; in rehearsal mode, also save the emissions
if preds is None:
    preds = nn_preds.detach().cpu().numpy()
    out_label_ids = encoding_labels.detach().cpu().numpy()
    if mode == "rehearsal":
        emissions = nn_emissions.detach().cpu().numpy()
else:
    preds = np.append(preds, nn_preds.detach().cpu().numpy(), axis=0)
    out_label_ids = np.append(out_label_ids, encoding_labels.detach().cpu().numpy(), axis=0)
    if mode == "rehearsal":
        emissions = np.append(emissions, nn_emissions.detach().cpu().numpy(), axis=0)
Prediction results
In rehearsal mode, the function directly returns preds, emissions, out_label_ids, and prototype_dists
if mode == "rehearsal":
    return preds, emissions, out_label_ids, prototype_dists
If the linear classifier is used, preds holds raw logits, so take the argmax over the label dimension to obtain the predicted class ids
if args.cls_name == "linear":
    preds = np.argmax(preds, axis=2)
out_label_list and preds_list store the label-string sequences from the reference labeling and from the custom prediction method, respectively
label_map = {i: "I-" + label for i, label in enumerate(labels)}
label_map[0] = "O"
for i in range(out_label_ids.shape[0]):
    for j in range(out_label_ids.shape[1]):
        if out_label_ids[i, j] != pad_token_label_id:
            out_label_list[i].append(label_map[out_label_ids[i][j]])
            preds_list[i].append(label_map[preds[i][j]])
Output the evaluation metrics
metric = load_metric("seqeval")
metric.add_batch(predictions=preds_list, references=out_label_list)
macro_results, micro_results, _ = compute_metrics(metric)
Return the evaluation metrics and the predicted label sequences
return macro_results, micro_results, preds_list
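A runnable toy of the seqeval step above, assuming the datasets and seqeval packages are installed (compute_metrics in the repo wraps the raw metric output into macro/micro results):

from datasets import load_metric

metric = load_metric("seqeval")
predictions = [["O", "I-PER", "I-PER", "O"]]
references = [["O", "I-PER", "O", "O"]]
metric.add_batch(predictions=predictions, references=references)
results = metric.compute()
print(results["overall_precision"], results["overall_recall"], results["overall_f1"])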
train()
Compute the total number of training steps t_total and the number of epochs num_train_epochs
if args.max_steps > 0:
    t_total = args.max_steps
    args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
Configure the optimizer
Use the AdamW optimizer with weight decay, plus a linear learning-rate schedule with warmup
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
Training
Iterate over the epochs
for epoch in train_iterator:
    if epoch >= args.start_train_o_epoch:
Get each class's class similarity
prototype_dists = get_rehearsal_prototype(args, model, tokenizer, labels, pad_token_label_id, mode="rehearsal", data_dir=data_dir)
Get the samples' logits and labels
for step, batch in enumerate(epoch_iterator):
    model.train()
    if num_labels - 1 > args.per_types:
        t_logits_step = t_logits[step]
        new_labels = out_new_labels[step * args.train_batch_size:step * args.train_batch_size + len(batch[3])]
    else:
        t_logits_step = None
        new_labels = batch[3]
    if epoch >= args.start_train_o_epoch:
        loss_name = args.loss_name2
        cls = NNClassification()
        encodings, encoding_labels = get_token_features_and_labels(args, model, batch)
Compute the cosine similarity scores between tokens
        # top_emissions_step (batch_size*sent_len, batch_size*sent_len): cosine similarities
        # between tokens above the entity threshold; the median of the class similarities
        # serves as the entity threshold th_dists
        top_emissions_step, _ = cls.get_top_emissions_with_th(encodings, encoding_labels, th_dists=torch.median(prototype_dists).item())
    else:
        top_emissions_step = top_emissions
Train with the custom model and obtain the loss
inputs = {"input_ids": batch[0],
          "attention_mask": batch[1],
          "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
          "labels": new_labels,
          "t_logits": t_logits_step,
          "mode": "train",
          "loss_name": loss_name,
          "top_emissions": top_emissions_step,
          "topk_th": True}
outputs = model(**inputs)
loss = outputs[0]
if args.gradient_accumulation_steps > 1:
    loss = loss / args.gradient_accumulation_steps
Update the parameters
loss.backward()
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    global_step += 1
Evaluation
Evaluate the model's performance on the dev set
_, results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", data_dir=data_dir)
Save the model and the arguments
model_to_save.save_pretrained(output_dir)
torch.save(args, os.path.join(output_dir, "training_args.bin"))
Stop training when the maximum number of steps is reached
if args.max_steps > 0 and global_step > args.max_steps:
    epoch_iterator.close()  # close the current epoch iterator
    break
Return the global step count and the average loss
return global_step, tr_loss / global_step
Classifiers
NNClassification()
nn_classifier_dot_prototype()
Relabel "O" tokens from old entity classes using the prototypes
Compute the maximum similarity between each "O" token and the class prototypes
feature = reps.view(-1, reps.shape[-1])
for j in range(feature.size(0)):
    feature.data[j] = feature.data[j] / feature.data[j].norm()
means = torch.stack([exemplar_means[cls] for cls in range(n_tags)])
dists = torch.matmul(feature, means.T)
dists[:, 0] = torch.zeros(1).to(reps.device)
emissions, tags = dists.max(1)
Compute each class's prototype relabeling threshold
support_reps = support_reps.view(-1, support_reps.shape[-1])
for j in range(support_reps.size(0)):
    support_reps.data[j] = support_reps.data[j] / support_reps.data[j].norm()
support_reps = F.normalize(support_reps)
for i in range(n_tags):
    support_reps_dists = torch.matmul(support_reps[support_tags == i], means[i].T)
    prototype_dists.append(support_reps_dists.min(-1)[0])
get_top_emissions_with_th()
Compute the cosine similarities between samples
scores = self._euclidean_metric_dot_2(reps.view(-1, ndim), reps.view(-1, ndim), True)
scores = torch.where(reps_labels == 0, scores.double(), -100.)
scores = torch.scatter(scores, 1, torch.arange(scores.shape[0]).view(-1, 1).to(device), -100.)
Keep the scores above the entity threshold
top_emissions = scores > th_dists
Return top_emissions and scores
return top_emissions, scores
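A runnable toy of the thresholding above on random vectors (the label-based masking is omitted): pairwise cosine similarities with self-pairs suppressed, then a boolean mask of pairs above the entity threshold.

import torch
import torch.nn.functional as F

reps = F.normalize(torch.randn(6, 4), dim=1)
scores = reps @ reps.T  # (6, 6) pairwise cosine similarities
# Suppress self-pairs so a token is never counted as similar to itself.
scores = torch.scatter(scores, 1, torch.arange(scores.shape[0]).view(-1, 1), -100.)
th_dists = 0.5
top_emissions = scores > th_dists  # True where two tokens exceed the threshold
print(top_emissions.sum().item(), "pairs above the threshold")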