sparkle_code_guy / bert_related_task

4_bert_sentence_similarity.py 7.87 KB
import tensorflow as tf
import numpy as np
from transformers import TFBertPreTrainedModel, BertConfig, TFBertMainLayer, BertTokenizer
from transformers.modeling_tf_outputs import TFSemanticSegmenterOutput
from typing import Optional, Tuple, Union
import pandas as pd
from transformers.models.bert.modeling_tf_bert import (
    TFModelInputType,
    TFSequenceClassificationLoss,
    unpack_inputs, BERT_INPUTS_DOCSTRING
)
from transformers.utils import add_start_docstrings_to_model_forward
# The TensorFlow version of the BERT model adds a pooler layer on top of the original Transformer encoder. The
# pooler simply takes the first token ([CLS]) of the encoder's last-layer hidden state and applies a dense + tanh
# transformation; see TFBertPooler for the relevant code.
# This script is for learning purposes only: it illustrates how downstream tasks built on a pretrained BERT model
# can be tuned and modified, and custom downstream tasks can be adapted from this example.
# It only implements a simple unsupervised training run, for study purposes.
# https://huggingface.co/uer/sbert-base-chinese-nli
# Reference: https://github.com/jifei/simcse-tf2.git
# Original authors' code: https://github.com/princeton-nlp/SimCSE#model-list. The open-source SimCSE code only
# supports English by default; for Chinese, replace the pretrained BERT model with a Chinese one and switch to a
# Chinese training set.
# STS data download: https://pan.baidu.com/s/1JzzDVjaBRrDjYGgPJ6D4hQ?pwd=cxa6  (extraction code: cxa6)
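
# A minimal sketch (not in the original file) of what the TFBertPooler layer described above does, assuming
# `sequence_output` has shape (batch_size, seq_len, hidden_size):
#
#     pooler_dense = tf.keras.layers.Dense(hidden_size, activation="tanh")
#     pooled_output = pooler_dense(sequence_output[:, 0])  # dense + tanh over the [CLS] token only
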
class TFSimCSE(TFBertPreTrainedModel, TFSequenceClassificationLoss):
    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"]
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config: BertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = TFBertMainLayer(config, name="bert")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def call(
        self,
        input_ids: Optional[TFModelInputType] = None,
        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
        token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
        head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
        inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = None,
        labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
        training: Optional[bool] = False,
    ) -> Union[TFSemanticSegmenterOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs1 = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Note: outputs returned by transformers models must be a tuple or a dict (a ModelOutput-like object).
        # Here the BERT pooler output is returned as the sentence embedding.
        return TFSemanticSegmenterOutput(logits=outputs1.pooler_output, loss=None)

    def serving_output(self, output: dict) -> dict:
        return output

def simcse_loss(y_true, y_pred):
    """
    SimCSE contrastive loss (in-batch negatives).
    An explanation (in Chinese) is given at: https://blog.csdn.net/sslfk/article/details/123210756
    """
    # the batch is laid out as consecutive positive pairs: (0, 1), (2, 3), ...
    idxs = tf.range(0, tf.shape(y_pred)[0])
    idxs_1 = idxs[None, :]
    idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
    # y_true[i][j] == 1 exactly when j is the positive partner of i
    y_true = tf.equal(idxs_1, idxs_2)
    y_true = tf.cast(y_true, tf.keras.backend.floatx())
    y_pred = tf.math.l2_normalize(y_pred, axis=1)
    similarities = tf.matmul(y_pred, y_pred, transpose_b=True)
    # mask out self-similarity on the diagonal
    similarities = similarities - tf.eye(tf.shape(y_pred)[0]) * 1e12
    # temperature scaling
    similarities = similarities / 0.05
    loss = tf.keras.losses.categorical_crossentropy(y_true, similarities, from_logits=True)
    return tf.reduce_mean(loss)
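
# A quick worked example (illustrative only, not in the original file): for a batch of 4 embeddings,
#
#     idxs   = [0, 1, 2, 3]
#     idxs_2 = [1, 0, 3, 2]            # each index mapped to its positive partner
#     y_true = [[0, 1, 0, 0],          # row i is one-hot at the partner of i
#               [1, 0, 0, 0],
#               [0, 0, 0, 1],
#               [0, 0, 1, 0]]
#
# so the softmax cross-entropy pulls each embedding toward its partner and pushes it away from the rest of the batch.
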
def simcse_hard_neg_loss(y_true, y_pred):
    """
    SimCSE loss for hard negatives (or random negatives).
    The batch is laid out as consecutive triples: (anchor, positive, negative).
    """
    # rows: the anchors (every 3rd embedding); columns: all positives and negatives
    row = tf.range(0, tf.shape(y_pred)[0], 3)
    col = tf.range(tf.shape(y_pred)[0])
    col = tf.squeeze(tf.where(col % 3 != 0), axis=1)
    # within the gathered columns, the positive of anchor k sits at index 2k
    y_true = tf.range(0, tf.shape(col)[0], 2)
    y_pred = tf.math.l2_normalize(y_pred, axis=1)
    similarities = tf.matmul(y_pred, y_pred, transpose_b=True)
    similarities = tf.gather(similarities, row, axis=0)
    similarities = tf.gather(similarities, col, axis=1)
    # temperature scaling
    similarities = similarities / 0.05
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, similarities, from_logits=True)
    return tf.reduce_mean(loss)
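
# Illustrative example (not in the original file): with 6 embeddings arranged as two (anchor, positive, negative)
# triples,
#
#     row    = [0, 3]              # the anchors
#     col    = [1, 2, 4, 5]        # the positives and negatives
#     y_true = [0, 2]              # anchor 0's positive (embedding 1) is at col index 0; anchor 3's (embedding 4) is at col index 2
#
# so each anchor is scored against every positive/negative and trained to single out its own positive.
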
max_length = 60
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def simcse_generater():
    df_raw = pd.read_csv("data/sts_data/senteval_cn/ATEC/ATEC.train.data", sep="\t", header=None,
                         names=["x1", "x2", "y"])

    def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
        return {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "attention_mask": attention_masks,
        }, label

    def encode_examples(ds, limit=-1):
        # prepare lists, so that we can build up the final TensorFlow dataset from slices
        input_ids_list = []
        token_type_ids_list = []
        attention_mask_list = []
        label_list = []
        if limit > 0:
            # ds is a pandas DataFrame, so take the first `limit` rows
            ds = ds.head(limit)
        for index, row in ds.iterrows():
            x1 = row["x1"]
            x2 = row["x2"]
            # encode x1 and x2 back to back so each positive pair sits at consecutive indices,
            # as expected by simcse_loss
            for each in (x1, x2):
                bert_input = tokenizer.encode_plus(each,
                                                   add_special_tokens=True,  # add [CLS], [SEP]
                                                   padding='max_length',
                                                   truncation=True,
                                                   max_length=max_length,  # max length of the text that can go to BERT
                                                   return_attention_mask=True,  # mask so padding tokens are ignored
                                                   )
                input_ids_list.append(bert_input['input_ids'])
                token_type_ids_list.append(bert_input['token_type_ids'])
                attention_mask_list.append(bert_input['attention_mask'])
                label_list.append([0])
        return tf.data.Dataset.from_tensor_slices(
            (input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)
    # train dataset: shuffle whole pairs (not individual sentences) so every positive pair stays
    # adjacent in the batch, as simcse_loss requires
    batch_size = 100
    ds_train_encoded = encode_examples(df_raw).batch(2).shuffle(10000).unbatch().batch(batch_size)
    return ds_train_encoded
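
# To inspect one batch produced above (illustrative note, not in the original file):
#
#     features, labels = next(iter(simcse_generater()))
#     print(features["input_ids"].shape)   # (100, 60): 50 adjacent (x1, x2) pairs per step
#
# Each ATEC row contributes two adjacent embeddings, which simcse_loss treats as a positive pair.
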
learning_rate = 2e-5
my_model = TFSimCSE.from_pretrained('bert-base-chinese')
# optimizer Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08, clipnorm=1)
# train with the custom SimCSE contrastive loss defined above
my_model.compile(optimizer=optimizer, loss=simcse_loss)
# fit model
bert_history = my_model.fit(simcse_generater(), epochs=1)
# save the trained model
tf.keras.models.save_model(my_model, filepath="my_model1")
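
# A minimal usage sketch (added for illustration; the two sentences below are placeholders): encode a pair of
# sentences with the same tokenizer, take the pooler output returned in `.logits` as the sentence embedding,
# and compare the embeddings with cosine similarity.
def sentence_similarity(text_a, text_b):
    enc = tokenizer([text_a, text_b], padding='max_length', truncation=True,
                    max_length=max_length, return_tensors="tf")
    emb = tf.math.l2_normalize(my_model(enc).logits, axis=1)  # L2-normalized pooler outputs
    return float(tf.reduce_sum(emb[0] * emb[1]))  # cosine similarity of the two sentence embeddings

print(sentence_similarity("花呗怎么还款", "花呗如何还款"))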