You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

56 lines
1.6 KiB

"""
This file contains deprecated code that can only be used with the old `model.fit`-style Sentence Transformers v2.X training.
It exists for backwards compatibility with the `model.old_fit` method, but will be removed in a future version.
Nowadays, with Sentence Transformers v3+, it is recommended to use the `SentenceTransformerTrainer` class to train models.
See https://www.sbert.net/docs/sentence_transformer/training_overview.html for more information.
Instead, you should create a `datasets` `Dataset` for training: https://huggingface.co/docs/datasets/create_dataset
"""
from __future__ import annotations
import gzip
from . import InputExample
class PairedFilesReader:
"""Reads in the a Pair Dataset, split in two files"""
def __init__(self, filepaths):
self.filepaths = filepaths
def get_examples(self, max_examples=0):
fIns = []
for filepath in self.filepaths:
fIn = (
gzip.open(filepath, "rt", encoding="utf-8")
if filepath.endswith(".gz")
else open(filepath, encoding="utf-8")
)
fIns.append(fIn)
examples = []
eof = False
while not eof:
texts = []
for fIn in fIns:
text = fIn.readline()
if text == "":
eof = True
break
texts.append(text)
if eof:
break
examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1))
if max_examples > 0 and len(examples) >= max_examples:
break
return examples