Source code for synapse.ml.hf.HuggingFaceSentenceEmbedder

# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from sentence_transformers import SentenceTransformer
from pyspark.ml.functions import predict_batch_udf
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params
from pyspark.sql.types import (
    ArrayType,
    FloatType,
)


class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol):
    """
    Custom transformer that extends PySpark's Transformer class to perform
    sentence embedding using a model with optional TensorRT acceleration.
    """

    # Number of rows sampled from the DataFrame for model optimization
    NUM_OPT_ROWS = 100
    BATCH_SIZE_DEFAULT = 64

    # Define additional parameters
    runtime = Param(
        Params._dummy(),
        "runtime",
        "Specifies the runtime environment: cpu, cuda, or tensorrt",
    )
    batchSize = Param(Params._dummy(), "batchSize", "Batch size for embeddings", int)
    modelName = Param(Params._dummy(), "modelName", "Full Model Name parameter")

    def __init__(
        self,
        inputCol=None,
        outputCol=None,
        runtime=None,
        batchSize=None,
        modelName=None,
    ):
        """
        Initialize the HuggingFaceSentenceEmbedder with input/output columns
        and an optional runtime (TensorRT) flag.
        """
        super(HuggingFaceSentenceEmbedder, self).__init__()

        # Determine the default runtime based on CUDA availability
        default_runtime = "cuda" if torch.cuda.is_available() else "cpu"
        # Fall back to CPU if CUDA is not available, regardless of the requested runtime
        effective_runtime = runtime if torch.cuda.is_available() else "cpu"

        self._setDefault(
            runtime=default_runtime,
            batchSize=self.BATCH_SIZE_DEFAULT,
        )
        self._set(
            inputCol=inputCol,
            outputCol=outputCol,
            runtime=effective_runtime,
            batchSize=batchSize if batchSize is not None else self.BATCH_SIZE_DEFAULT,
            modelName=modelName,
        )
        self.optData = None
        self.model = None
        # Row count of the input DataFrame; set when the DataFrame is available
        self.row_count = 0

    # Setter method for batchSize
    def setBatchSize(self, value):
        self._set(batchSize=value)
        return self

    # Getter method for batchSize
    def getBatchSize(self):
        return self.getOrDefault(self.batchSize)
    def setRuntime(self, value):
        """
        Sets the runtime environment for the model.
        Supported values: 'cpu', 'cuda', 'tensorrt'
        """
        if value not in ["cpu", "cuda", "tensorrt"]:
            raise ValueError(
                "Invalid runtime specified. Choose from 'cpu', 'cuda', 'tensorrt'"
            )
        self.set(self.runtime, value)
        return self

    def getRuntime(self):
        return self.getOrDefault(self.runtime)
    # Setter method for modelName
    def setModelName(self, value):
        self._set(modelName=value)
        return self

    # Getter method for modelName
    def getModelName(self):
        return self.getOrDefault(self.modelName)
    def setRowCount(self, row_count):
        self.row_count = row_count
        # Fall back to CPU if the row count is below the optimization
        # threshold or CUDA is not available
        if self.row_count < self.NUM_OPT_ROWS or not torch.cuda.is_available():
            self.set(self.runtime, "cpu")
        return self
    # Optimize the model using Model Navigator with TensorRT configuration.
    def _optimize(self, model):
        import tensorrt as trt
        import model_navigator as nav

        conf = nav.OptimizeConfig(
            target_formats=(nav.Format.TENSORRT,),
            runners=("TensorRT",),
            optimization_profile=nav.OptimizationProfile(
                max_batch_size=self.BATCH_SIZE_DEFAULT
            ),
            custom_configs=[
                nav.TorchConfig(autocast=True),
                nav.TorchScriptConfig(autocast=True),
                nav.TensorRTConfig(
                    precision=(nav.TensorRTPrecision.FP16,),
                    onnx_parser_flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM.value],
                ),
            ],
        )

        def _get_dataloader():
            input_data = self.optData
            return [
                (
                    0,
                    (
                        input_data,
                        {"show_progress_bar": False, "batch_size": self.getBatchSize()},
                    ),
                )
            ]

        nav.optimize(model.encode, dataloader=_get_dataloader(), config=conf)

    def _predict_batch_fn(self):
        """
        Create and return a function for batch prediction.
        """
        runtime = self.getRuntime()
        if self.model is None:
            global model
            modelName = self.getModelName()
            model = SentenceTransformer(
                modelName, device="cpu" if runtime == "cpu" else "cuda"
            ).eval()

            if runtime == "tensorrt":
                import tensorrt as trt
                import model_navigator as nav

                # This forces Model Navigator to use a specific runtime
                nav.inplace_config.strategy = nav.SelectedRuntimeStrategy(
                    "trt-fp16", "TensorRT"
                )
                moduleName = modelName.split("/")[1]
                model = nav.Module(model, name=moduleName, forward_func="forward")
                try:
                    nav.load_optimized()
                except Exception:
                    self._optimize(model)
                    nav.load_optimized()

            self.model = model

        def predict(inputs):
            """
            Predict method to encode inputs using the model.
            """
            with torch.no_grad():
                output = model.encode(
                    inputs.tolist(), convert_to_tensor=False, show_progress_bar=False
                )
            return output

        return predict

    # Method to apply the transformation to the dataset
    def _transform(self, dataset, spark):
        """
        Apply the transformation to the input dataset.
        """
        input_col = self.getInputCol()
        output_col = self.getOutputCol()

        size = dataset.count()
        self.setRowCount(size)
        # Collect a sample of rows to drive TensorRT optimization
        if size >= self.NUM_OPT_ROWS:
            df = dataset.take(self.NUM_OPT_ROWS)
            self.optData = [row[input_col] for row in df]

        encode = predict_batch_udf(
            self._predict_batch_fn,
            return_type=ArrayType(FloatType()),
            batch_size=self.getBatchSize(),
        )
        return dataset.withColumn(output_col, encode(input_col))
    def transform(self, dataset, spark=None):
        """
        Public method to transform the dataset.
        """
        return self._transform(dataset, spark)
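
For reference, a minimal usage sketch of this transformer. It assumes an active SparkSession and that the `sentence-transformers/all-MiniLM-L6-v2` checkpoint can be downloaded; the model name, column names, and batch size here are illustrative choices, not values mandated by the class.

# Minimal usage sketch (illustrative values; runtime="cpu" keeps it portable --
# use "cuda" or "tensorrt" on a GPU cluster to exercise the accelerated paths).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("The quick brown fox",), ("Jumped over the lazy dog",)],
    ["text"],
)

embedder = HuggingFaceSentenceEmbedder(
    inputCol="text",
    outputCol="embeddings",
    runtime="cpu",
    batchSize=16,
    modelName="sentence-transformers/all-MiniLM-L6-v2",
)

# transform() counts the rows to decide whether to collect optimization data,
# then appends an array<float> column of sentence embeddings.
result = embedder.transform(df)
result.show(truncate=80)

Note that because the transformer builds its UDF via predict_batch_udf, the model is loaded once per executor rather than once per row, which is the design choice that makes batch encoding efficient on a cluster.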