You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
56 lines
1.6 KiB
56 lines
1.6 KiB
"""
|
|
This file contains deprecated code that can only be used with the old `model.fit`-style Sentence Transformers v2.X training.
|
|
It exists for backwards compatibility with the `model.old_fit` method, but will be removed in a future version.
|
|
|
|
Nowadays, with Sentence Transformers v3+, it is recommended to use the `SentenceTransformerTrainer` class to train models.
|
|
See https://www.sbert.net/docs/sentence_transformer/training_overview.html for more information.
|
|
|
|
Instead, you should create a `datasets` `Dataset` for training: https://huggingface.co/docs/datasets/create_dataset
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import gzip
|
|
|
|
from . import InputExample
|
|
|
|
|
|
class PairedFilesReader:
    """Read a paired dataset that is split across several parallel text files.

    Line ``i`` of every file is combined into a single ``InputExample`` whose
    ``texts`` holds one entry per file, in the order of ``filepaths``. Paths
    ending in ``.gz`` are read through ``gzip``; every file is decoded as UTF-8.
    """

    def __init__(self, filepaths):
        """Remember the parallel files to read.

        Args:
            filepaths: Iterable of file paths, one per side of the pair.
                The files are only opened when ``get_examples`` is called.
        """
        self.filepaths = filepaths

    @staticmethod
    def _open_text(filepath):
        """Open ``filepath`` for UTF-8 text reading, decompressing ``.gz`` files."""
        # str() lets os.PathLike objects (e.g. pathlib.Path) pass the suffix check;
        # both open() and gzip.open() accept them as-is.
        if str(filepath).endswith(".gz"):
            return gzip.open(filepath, "rt", encoding="utf-8")
        return open(filepath, encoding="utf-8")

    def get_examples(self, max_examples=0):
        """Build one ``InputExample`` per row of the parallel files.

        Reading stops as soon as any file runs out of lines, so the number of
        examples is bounded by the shortest file. Lines are passed through
        unmodified, i.e. they keep their trailing newline.

        Args:
            max_examples: Maximum number of examples to return. Values
                ``<= 0`` mean "no limit".

        Returns:
            A list of ``InputExample`` objects with ``guid`` set to the row
            index (as a string), ``texts`` set to the list of lines of that
            row and ``label`` set to ``1``. The list is empty when
            ``filepaths`` is empty or any file has no lines.
        """
        # Function-scope imports keep this block self-contained; the module
        # header only imports ``gzip``.
        from contextlib import ExitStack
        from itertools import islice

        # ExitStack closes every handle opened so far, even when opening a
        # later file or constructing an example raises.
        with ExitStack() as stack:
            handles = [
                stack.enter_context(self._open_text(filepath))
                for filepath in self.filepaths
            ]

            # zip() advances all files in lockstep and ends at the shortest
            # one; with no files at all it yields nothing instead of looping
            # forever.
            rows = zip(*handles)
            if max_examples > 0:
                rows = islice(rows, max_examples)

            return [
                InputExample(guid=str(index), texts=list(texts), label=1)
                for index, texts in enumerate(rows)
            ]
|