1. Source
https://www.sbert.net/examples/training/sts/README.html
https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark.py
2. Code
This example trains BERT (or any other transformer model such as RoBERTa, DistilBERT, etc.) from scratch on the STSbenchmark dataset. It produces sentence embeddings that can be compared with cosine similarity to measure semantic similarity.
Usage:
python training_stsbenchmark.py OR python training_stsbenchmark.py pretrained_transformer_model_name
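As a preview of what such a model is used for, here is a minimal sketch of comparing two sentences with cosine similarity. It loads the pre-trained 'stsb-distilroberta-base-v2' model that also appears in the evaluation section below; the model trained by the script that follows can be used the same way by passing its output path to SentenceTransformer(). The full training script follows.

from sentence_transformers import SentenceTransformer, util

# Any STS-style SentenceTransformer works here; swap in the output path of the training script below.
model = SentenceTransformer('stsb-distilroberta-base-v2')

emb1 = model.encode("A man is playing a guitar.", convert_to_tensor=True)
emb2 = model.encode("Someone is playing an instrument.", convert_to_tensor=True)

# util.cos_sim returns a 1x1 similarity matrix here (older versions: util.pytorch_cos_sim)
print("Cosine similarity: {:.4f}".format(util.cos_sim(emb1, emb2).item()))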
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Check if dataset exists. If not, download and extract it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

# You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else 'distilbert-base-uncased'

# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_' + model_name.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")

train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)

        if row['split'] == 'dev':
            dev_samples.append(inp_example)
        elif row['split'] == 'test':
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)
3. Evaluation
""" This examples loads a pre-trained model and evaluates it on the STSbenchmark dataset
此示例加載預訓練模型並在 STSbenchmark 數據集上對其進行評估 Usage: python evaluation_stsbenchmark.py OR python evaluation_stsbenchmark.py model_name """ from sentence_transformers import SentenceTransformer, util, LoggingHandler, InputExample from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator import logging import sys import torch import gzip import os import csv script_folder_path = os.path.dirname(os.path.realpath(__file__)) #Limit torch to 4 threads 將割炬限制為 4 個線程 torch.set_num_threads(4) #### Just some code to print debug information to stdout logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()]) #### /print debug information to stdout model_name = sys.argv[1] if len(sys.argv) > 1 else 'stsb-distilroberta-base-v2' # Load a named sentence model (based on BERT). This will download the model from our server. # Alternatively, you can also pass a filepath to SentenceTransformer()
加載命名句子模型(基於 BERT)。 這將從我們的服務器下載模型。 或者,您也可以將文件路徑傳遞給 SentenceTransformer()
model = SentenceTransformer(model_name) sts_dataset_path = 'data/stsbenchmark.tsv.gz' if not os.path.exists(sts_dataset_path): util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path) train_samples = [] dev_samples = [] test_samples = [] with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn: reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) for row in reader: score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score) if row['split'] == 'dev': dev_samples.append(inp_example) elif row['split'] == 'test': test_samples.append(inp_example) else: train_samples.append(inp_example) evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev') model.evaluate(evaluator) evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test') model.evaluate(evaluator)
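For reference, EmbeddingSimilarityEvaluator reports the correlation (Spearman and Pearson) between the cosine similarity of the two sentence embeddings and the gold similarity score. Below is a minimal sketch of the Spearman part of that computation, assuming scipy is installed and reusing model and dev_samples from the script above:

from scipy.stats import spearmanr
from sentence_transformers import util

sentences1 = [ex.texts[0] for ex in dev_samples]
sentences2 = [ex.texts[1] for ex in dev_samples]
gold_scores = [ex.label for ex in dev_samples]

emb1 = model.encode(sentences1, convert_to_tensor=True)
emb2 = model.encode(sentences2, convert_to_tensor=True)

# Cosine similarity of each aligned pair (the diagonal of the pairwise similarity matrix)
cosine_scores = util.cos_sim(emb1, emb2).diagonal().cpu().tolist()

spearman, _ = spearmanr(gold_scores, cosine_scores)
print("Spearman correlation on sts-dev: {:.4f}".format(spearman))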
4. Continued Training
""" This example loads the pre-trained SentenceTransformer model 'nli-distilroberta-base-v2' from the server. It then fine-tunes this model for some epochs on the STS benchmark dataset. Note: In this example, you must specify a SentenceTransformer model. If you want to fine-tune a huggingface/transformers model like bert-base-uncased, see training_nli.py and training_stsbenchmark.py
此示例從服務器加載預訓練的 SentenceTransformer 模型“nli-distilroberta-base-v2”。
然后,它針對 STS 基准數據集上的某些時期對該模型進行微調。
注意:在此示例中,您必須指定 SentenceTransformer 模型。
如果你想微調像 bert-base-uncased 這樣的擁抱臉/變形金剛模型,請參閱 training_nli.py 和 training_stsbenchmark.py
""" from torch.utils.data import DataLoader import math from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator import logging from datetime import datetime import os import gzip import csv #### Just some code to print debug information to stdout logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()]) #### /print debug information to stdout #Check if dataset exsist. If not, download and extract it sts_dataset_path = 'datasets/stsbenchmark.tsv.gz' if not os.path.exists(sts_dataset_path): util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path) # Read the dataset model_name = 'nli-distilroberta-base-v2' train_batch_size = 16 num_epochs = 4 model_save_path = 'output/training_stsbenchmark_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") # Load a pre-trained sentence transformer model 將數據集轉換為准備訓練的 DataLoader model = SentenceTransformer(model_name) # Convert the dataset to a DataLoader ready for training logging.info("Read STSbenchmark train dataset") train_samples = [] dev_samples = [] test_samples = [] with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn: reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) for row in reader: score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score) if row['split'] == 'dev': dev_samples.append(inp_example) elif row['split'] == 'test': test_samples.append(inp_example) else: train_samples.append(inp_example) train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size) train_loss = losses.CosineSimilarityLoss(model=model) # Development set: Measure correlation between cosine score and gold labels 開發集:測量余弦分數和黃金標簽之間的相關性 logging.info("Read STSbenchmark dev dataset") evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev') # Configure the training. We skip evaluation in this example 配置訓練。 我們在這個例子中跳過評估 warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up 10% 的列車數據用於熱身 logging.info("Warmup-steps: {}".format(warmup_steps)) # Train the model model.fit(train_objectives=[(train_dataloader, train_loss)], evaluator=evaluator, epochs=num_epochs, evaluation_steps=1000, warmup_steps=warmup_steps, output_path=model_save_path) ############################################################################## # # Load the stored model and evaluate its performance on STS benchmark dataset
加載存儲的模型並評估其在 STS 基准數據集上的性能
# ############################################################################## model = SentenceTransformer(model_save_path) test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test') test_evaluator(model, output_path=model_save_path)
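To make the note in the docstring concrete, here is a short sketch of the two ways a model is constructed in this post: loading an existing SentenceTransformer directly (this section) versus assembling one from a plain huggingface/transformers checkpoint plus a pooling layer (section 2).

from sentence_transformers import SentenceTransformer, models

# Path 1 (this section): start from an already-trained SentenceTransformer model
model = SentenceTransformer('nli-distilroberta-base-v2')

# Path 2 (section 2): wrap a plain transformers checkpoint with mean pooling
word_embedding_model = models.Transformer('bert-base-uncased')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])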
5. FAISS Semantic Search
""" This example uses Approximate Nearest Neighbor Search (ANN) with FAISS (https://github.com/facebookresearch/faiss). Searching a large corpus with Millions of embeddings can be time-consuming. To speed this up, ANN can index the existent vectors. For a new query vector, this index can be used to find the nearest neighbors. This nearest neighbor search is not perfect, i.e., it might not perfectly find all top-k nearest neighbors. In this example, we use FAISS with an inverse flat index (IndexIVFFlat). It learns to partition the corpus embeddings into different cluster (number is defined by n_clusters). At search time, the matching cluster for query is found and only vectors in this cluster must be search for nearest neighbors. This script will compare the result from ANN with exact nearest neighbor search and output a Recall@k value as well as the missing results in the top-k hits list. See the FAISS repository, how to install FAISS. As dataset, we use the Quora Duplicate Questions dataset, which contains about 500k questions (only 100k are used): https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs. As embeddings model, we use the SBERT model 'quora-distilbert-multilingual', that it aligned for 100 languages. I.e., you can type in a question in various languages and it will return the closest questions in the corpus (questions in the corpus are mainly in English).
此示例使用帶有 FAISS (https://github.com/facebookresearch/faiss) 的近似最近鄰搜索 (ANN)。
搜索具有數百萬個嵌入的大型語料庫可能非常耗時。為了加快速度,
ANN 可以索引現有的向量。對於新的查詢向量,該索引可用於查找最近的鄰居。
這種最近鄰搜索並不完美,即它可能無法完美地找到所有前 k 個最近鄰。
在此示例中,我們使用具有反向平坦索引 (IndexIVFFlat) 的 FAISS。它學習划分語料庫嵌入
進入不同的集群(數量由 n_clusters 定義)。在搜索時,找到查詢的匹配簇,並且只有向量
在這個集群中必須搜索最近的鄰居。
此腳本將 ANN 的結果與精確的最近鄰搜索進行比較並輸出 Recall@k 值
以及 top-k 命中列表中缺失的結果。
請參閱 FAISS 存儲庫,了解如何安裝 FAISS。
作為數據集,我們使用 Quora Duplicate Questions 數據集,其中包含大約 500k 個問題(僅使用了 100k):
https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs。
作為嵌入模型,我們使用 SBERT 模型“quora-distilbert-multilingual”,
它與 100 種語言保持一致。也就是說,你可以用各種語言輸入一個問題,它會
返回語料庫中最接近的問題(語料庫中的問題主要是英文)。
""" from sentence_transformers import SentenceTransformer, util import os import csv import pickle import time import faiss import numpy as np model_name = 'quora-distilbert-multilingual' model = SentenceTransformer(model_name) url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv" dataset_path = "quora_duplicate_questions.tsv" max_corpus_size = 100000 embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format(model_name.replace('/', '_'), max_corpus_size) embedding_size = 768 #Size of embeddings top_k_hits = 10 #Output k hits #Defining our FAISS index 定義我們的 FAISS 指數 用於faiss的集群數量。 選擇一個值 4*sqrt(N) 到 16*sqrt(N) - https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
#Number of clusters used for faiss. Select a value 4*sqrt(N) to 16*sqrt(N) - https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
n_clusters = 1024 #We use Inner Product (dot-product) as Index. We will normalize our vectors to unit length, then is Inner Product equal to cosine similarity #我們使用內積(點積)作為索引。 我們將向量歸一化為單位長度,然后內積等於余弦相似度 quantizer = faiss.IndexFlatIP(embedding_size) index = faiss.IndexIVFFlat(quantizer, embedding_size, n_clusters, faiss.METRIC_INNER_PRODUCT) #Number of clusters to explorer at search time. We will search for nearest neighbors in 3 clusters. 搜索時資源管理器的集群數量。 我們將在 3 個集群中搜索最近的鄰居。 index.nprobe = 3 #Check if embedding cache path exists if not os.path.exists(embedding_cache_path): # Check if the dataset exists. If not, download and extract # Download dataset if needed if not os.path.exists(dataset_path): print("Download dataset") util.http_get(url, dataset_path) # Get all unique sentences from the file corpus_sentences = set() with open(dataset_path, encoding='utf8') as fIn: reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL) for row in reader: corpus_sentences.add(row['question1']) if len(corpus_sentences) >= max_corpus_size: break corpus_sentences.add(row['question2']) if len(corpus_sentences) >= max_corpus_size: break corpus_sentences = list(corpus_sentences) print("Encode the corpus. This might take a while") corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_numpy=True) print("Store file on disc") with open(embedding_cache_path, "wb") as fOut: pickle.dump({'sentences': corpus_sentences, 'embeddings': corpus_embeddings}, fOut) else: print("Load pre-computed embeddings from disc") with open(embedding_cache_path, "rb") as fIn: cache_data = pickle.load(fIn) corpus_sentences = cache_data['sentences'] corpus_embeddings = cache_data['embeddings'] ### Create the FAISS index print("Start creating FAISS index") # First, we need to normalize vectors to unit length corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1)[:, None] # Then we train the index to find a suitable clustering index.train(corpus_embeddings) # Finally we add all embeddings to the index index.add(corpus_embeddings) ######### Search in the index ########### print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences))) while True: inp_question = input("Please enter a question: ") start_time = time.time() question_embedding = model.encode(inp_question) #FAISS works with inner product (dot product). When we normalize vectors to unit length, inner product is equal to cosine similarity question_embedding = question_embedding / np.linalg.norm(question_embedding) question_embedding = np.expand_dims(question_embedding, axis=0) # Search in FAISS. It returns a matrix with distances and corpus ids. 
distances, corpus_ids = index.search(question_embedding, top_k_hits) # We extract corpus ids and scores for the first query hits = [{'corpus_id': id, 'score': score} for id, score in zip(corpus_ids[0], distances[0])] hits = sorted(hits, key=lambda x: x['score'], reverse=True) end_time = time.time() print("Input question:", inp_question) print("Results (after {:.3f} seconds):".format(end_time-start_time)) for hit in hits[0:top_k_hits]: print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences[hit['corpus_id']])) # Approximate Nearest Neighbor (ANN) is not exact, it might miss entries with high cosine similarity # Here, we compute the recall of ANN compared to the exact results correct_hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k_hits)[0] correct_hits_ids = set([hit['corpus_id'] for hit in correct_hits]) ann_corpus_ids = set([hit['corpus_id'] for hit in hits]) if len(ann_corpus_ids) != len(correct_hits_ids): print("Approximate Nearest Neighbor returned a different number of results than expected") recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids) print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100)) if recall < 1: print("Missing results:") for hit in correct_hits[0:top_k_hits]: if hit['corpus_id'] not in ann_corpus_ids: print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences[hit['corpus_id']])) print("\n\n========\n")
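The script above combines index construction, embedding caching, and recall checking in one piece. For orientation, here is a condensed sketch of just the IndexIVFFlat lifecycle it relies on, using random unit-normalized vectors in place of real sentence embeddings:

import faiss
import numpy as np

embedding_size = 768
n_clusters = 64                      # 4*sqrt(N) to 16*sqrt(N) is the usual guideline
corpus = np.random.rand(10000, embedding_size).astype('float32')
corpus = corpus / np.linalg.norm(corpus, axis=1, keepdims=True)   # unit length -> inner product == cosine

quantizer = faiss.IndexFlatIP(embedding_size)                     # coarse quantizer defining the clusters
index = faiss.IndexIVFFlat(quantizer, embedding_size, n_clusters, faiss.METRIC_INNER_PRODUCT)

index.train(corpus)      # learn the cluster centroids
index.add(corpus)        # assign every vector to its cluster
index.nprobe = 3         # clusters inspected per query (higher = better recall, slower)

query = corpus[:1]       # a single query vector, shape (1, embedding_size)
scores, ids = index.search(query, 10)
print(ids[0], scores[0])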
6. Practical Application
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import os
from root_path import root
import pandas as pd


class MySentenceBert():
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    def __init__(self):
        self.train_batch_size = 16
        self.num_epochs = 4
        data_path = os.path.join(root, "data", "sim_data")
        self.train_data = pd.read_csv(os.path.join(data_path, "train.csv"), sep="\t")
        self.val_data = pd.read_csv(os.path.join(data_path, "val.csv"), sep="\t")
        self.test_data = pd.read_csv(os.path.join(data_path, "test.csv"), sep="\t")
        self.model_save_path = os.path.join(root, "chkpt",
                                            "sentence_bert_model" + datetime.now().strftime("_%Y_%m_%d_%H_%M"))

    def data_generator(self):
        logging.info("generator dataset")
        train_datas = []
        dev_datas = []
        test_datas = []
        for s1, s2, l in zip(self.train_data["s1"], self.train_data["s2"], self.train_data["y"]):
            train_datas.append(InputExample(texts=[s1, s2], label=float(l)))
        for s1, s2, l in zip(self.val_data["s1"], self.val_data["s2"], self.val_data["y"]):
            dev_datas.append(InputExample(texts=[s1, s2], label=float(l)))
        for s1, s2, l in zip(self.test_data["s1"], self.test_data["s2"], self.test_data["y"]):
            test_datas.append(InputExample(texts=[s1, s2], label=float(l)))
        return train_datas, dev_datas, test_datas

    def train(self, train_datas, dev_datas, model):
        train_dataloader = DataLoader(train_datas, shuffle=True, batch_size=self.train_batch_size)
        train_loss = losses.CosineSimilarityLoss(model=model)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_datas, name='sts-dev')
        warmup_steps = math.ceil(len(train_dataloader) * self.num_epochs * 0.1)
        logging.info("Warmup-steps: {}".format(warmup_steps))
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluator,
                  epochs=self.num_epochs,
                  evaluation_steps=1000,
                  warmup_steps=warmup_steps,
                  output_path=self.model_save_path)

    def test(self, test_samples):
        model = SentenceTransformer(self.model_save_path)
        test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
        test_evaluator(model, output_path=self.model_save_path)

    def main(self):
        train_datas, dev_datas, test_datas = self.data_generator()
        model_name = os.path.join(root, "chkpt", "bert-base-chinese")
        word_embedding_model = models.Transformer(model_name)
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        self.train(train_datas, dev_datas, model)
        self.test(test_datas)


if __name__ == '__main__':
    MySentenceBert().main()
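MySentenceBert reads tab-separated files with columns s1, s2, and y, and casts y to float for CosineSimilarityLoss, so y should already be a similarity value (e.g., 0/1 or something in the range 0 to 1). Here is a hypothetical two-row example of the expected file format; the sentences and labels are made up purely for illustration:

import pandas as pd

# Hypothetical toy data; the real files live under <root>/data/sim_data/
df = pd.DataFrame({
    "s1": ["今天天氣很好", "我想退貨"],
    "s2": ["今天天氣不錯", "如何申請退款"],
    "y":  [1, 0],
})
df.to_csv("train.csv", sep="\t", index=False)
print(df)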
from sentence_transformers import SentenceTransformer, util
import os
import csv
import pickle
import time
from root_path import root
import json


class SemanticSearch():
    def __init__(self):
        model_name = os.path.join(root, "chkpt", "sentence_bert_model_2021_08_05_18_16")
        self.model = SentenceTransformer(model_name)
        embedding_cache_path = 'semantic_search_embedding.pkl'
        dataset_path = os.path.join(root, "data", "bert_data", "index.txt")
        with open(os.path.join(root, "config", "code_to_label.json"), "r", encoding="utf8") as f:
            self.d = json.load(f)
        self.sentences = list()
        self.code = list()
        if not os.path.exists(embedding_cache_path):
            with open(dataset_path, encoding='utf8') as fIn:
                for read_line in fIn:
                    read_line = read_line.split("\t")
                    self.sentences.append(read_line[0])
                    self.code.append(read_line[1].replace("\n", ""))
            print("Encode the corpus. This might take a while")
            self.embeddings = self.model.encode(self.sentences, show_progress_bar=True, convert_to_tensor=True)
            print("Store file on disc")
            with open(embedding_cache_path, "wb") as fOut:
                pickle.dump({'sentences': self.sentences, 'embeddings': self.embeddings, "code": self.code}, fOut)
        else:
            print("Load pre-computed embeddings from disc")
            with open(embedding_cache_path, "rb") as fIn:
                cache_data = pickle.load(fIn)
                self.sentences = cache_data['sentences']
                self.embeddings = cache_data['embeddings']
                self.code = cache_data["code"]

    def query(self, query):
        inp_question = query
        question_embedding = self.model.encode(inp_question, convert_to_tensor=True)
        hits = util.semantic_search(question_embedding, self.embeddings)
        hit = hits[0][0]  # Get the hits for the first query
        score = hit['score']
        text = self.sentences[hit['corpus_id']]
        kh_code = self.code[hit['corpus_id']]
        label = self.d[kh_code][1]
        return label, score, text

    def main(self):
        self.query("你好")


if __name__ == '__main__':
    SemanticSearch().main()
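A small usage sketch of the class above, assuming the checkpoint path, index.txt, and code_to_label.json exist as configured in __init__; the query string here is made up for illustration:

searcher = SemanticSearch()
label, score, text = searcher.query("怎麼退貨")
print("label: {}\tscore: {:.3f}\tmost similar indexed sentence: {}".format(label, score, text))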