Huggingface
zenml.integrations.huggingface
special
Initialization of the Huggingface integration.
HuggingfaceIntegration (Integration)
Definition of Huggingface integration for ZenML.
Source code in zenml/integrations/huggingface/__init__.py
class HuggingfaceIntegration(Integration):
"""Definition of Huggingface integration for ZenML."""
NAME = HUGGINGFACE
REQUIREMENTS = ["transformers", "datasets"]
@classmethod
def activate(cls) -> None:
"""Activates the integration."""
from zenml.integrations.huggingface import materializers # noqa
activate()
classmethod
Activates the integration.
Source code in zenml/integrations/huggingface/__init__.py
@classmethod
def activate(cls) -> None:
"""Activates the integration."""
from zenml.integrations.huggingface import materializers # noqa
materializers
special
Initialization of Huggingface materializers.
huggingface_datasets_materializer
Implementation of the Huggingface datasets materializer.
HFDatasetMaterializer (BaseMaterializer)
Materializer to read data to and from huggingface datasets.
Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
class HFDatasetMaterializer(BaseMaterializer):
"""Materializer to read data to and from huggingface datasets."""
ASSOCIATED_TYPES = (Dataset, DatasetDict)
ASSOCIATED_ARTIFACT_TYPES = (DataArtifact,)
def handle_input(self, data_type: Type[Any]) -> Dataset:
"""Reads Dataset.
Args:
data_type: The type of the dataset to read.
Returns:
The dataset read from the specified dir.
"""
super().handle_input(data_type)
temp_dir = TemporaryDirectory()
io_utils.copy_dir(
os.path.join(self.artifact.uri, DEFAULT_DATASET_DIR),
temp_dir.name,
)
return load_from_disk(temp_dir.name)
def handle_return(self, ds: Type[Any]) -> None:
"""Writes a Dataset to the specified dir.
Args:
ds: The Dataset to write.
"""
super().handle_return(ds)
with TemporaryDirectory() as temp_dir:
ds.save_to_disk(temp_dir)
io_utils.copy_dir(
temp_dir,
os.path.join(self.artifact.uri, DEFAULT_DATASET_DIR),
)
handle_input(self, data_type)
Reads Dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data_type |
Type[Any] |
The type of the dataset to read. |
required |
Returns:
Type | Description |
---|---|
Dataset |
The dataset read from the specified dir. |
Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
def handle_input(self, data_type: Type[Any]) -> Dataset:
"""Reads Dataset.
Args:
data_type: The type of the dataset to read.
Returns:
The dataset read from the specified dir.
"""
super().handle_input(data_type)
temp_dir = TemporaryDirectory()
io_utils.copy_dir(
os.path.join(self.artifact.uri, DEFAULT_DATASET_DIR),
temp_dir.name,
)
return load_from_disk(temp_dir.name)
handle_return(self, ds)
Writes a Dataset to the specified dir.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
ds |
Type[Any] |
The Dataset to write. |
required |
Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
def handle_return(self, ds: Type[Any]) -> None:
"""Writes a Dataset to the specified dir.
Args:
ds: The Dataset to write.
"""
super().handle_return(ds)
with TemporaryDirectory() as temp_dir:
ds.save_to_disk(temp_dir)
io_utils.copy_dir(
temp_dir,
os.path.join(self.artifact.uri, DEFAULT_DATASET_DIR),
)
huggingface_pt_model_materializer
Implementation of the Huggingface PyTorch model materializer.
HFPTModelMaterializer (BaseMaterializer)
Materializer to read torch model to and from huggingface pretrained model.
Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
class HFPTModelMaterializer(BaseMaterializer):
"""Materializer to read torch model to and from huggingface pretrained model."""
ASSOCIATED_TYPES = (PreTrainedModel,)
ASSOCIATED_ARTIFACT_TYPES = (ModelArtifact,)
def handle_input(self, data_type: Type[Any]) -> PreTrainedModel:
"""Reads HFModel.
Args:
data_type: The type of the model to read.
Returns:
The model read from the specified dir.
"""
super().handle_input(data_type)
config = AutoConfig.from_pretrained(
os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR)
)
architecture = config.architectures[0]
model_cls = getattr(
importlib.import_module("transformers"), architecture
)
return model_cls.from_pretrained(
os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR)
)
def handle_return(self, model: Type[Any]) -> None:
"""Writes a Model to the specified dir.
Args:
model: The Torch Model to write.
"""
super().handle_return(model)
temp_dir = TemporaryDirectory()
model.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR),
)
handle_input(self, data_type)
Reads HFModel.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data_type |
Type[Any] |
The type of the model to read. |
required |
Returns:
Type | Description |
---|---|
PreTrainedModel |
The model read from the specified dir. |
Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
def handle_input(self, data_type: Type[Any]) -> PreTrainedModel:
"""Reads HFModel.
Args:
data_type: The type of the model to read.
Returns:
The model read from the specified dir.
"""
super().handle_input(data_type)
config = AutoConfig.from_pretrained(
os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR)
)
architecture = config.architectures[0]
model_cls = getattr(
importlib.import_module("transformers"), architecture
)
return model_cls.from_pretrained(
os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR)
)
handle_return(self, model)
Writes a Model to the specified dir.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
model |
Type[Any] |
The Torch Model to write. |
required |
Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
def handle_return(self, model: Type[Any]) -> None:
"""Writes a Model to the specified dir.
Args:
model: The Torch Model to write.
"""
super().handle_return(model)
temp_dir = TemporaryDirectory()
model.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR),
)
huggingface_tf_model_materializer
Implementation of the Huggingface TF model materializer.
HFTFModelMaterializer (BaseMaterializer)
Materializer to read Tensorflow model to and from huggingface pretrained model.
Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
class HFTFModelMaterializer(BaseMaterializer):
"""Materializer to read Tensorflow model to and from huggingface pretrained model."""
ASSOCIATED_TYPES = (TFPreTrainedModel,)
ASSOCIATED_ARTIFACT_TYPES = (ModelArtifact,)
def handle_input(self, data_type: Type[Any]) -> TFPreTrainedModel:
"""Reads HFModel.
Args:
data_type: The type of the model to read.
Returns:
The model read from the specified dir.
"""
super().handle_input(data_type)
config = AutoConfig.from_pretrained(
os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR)
)
architecture = "TF" + config.architectures[0]
model_cls = getattr(
importlib.import_module("transformers"), architecture
)
return model_cls.from_pretrained(
os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR)
)
def handle_return(self, model: Type[Any]) -> None:
"""Writes a Model to the specified dir.
Args:
model: The TF Model to write.
"""
super().handle_return(model)
temp_dir = TemporaryDirectory()
model.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR),
)
handle_input(self, data_type)
Reads HFModel.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data_type |
Type[Any] |
The type of the model to read. |
required |
Returns:
Type | Description |
---|---|
TFPreTrainedModel |
The model read from the specified dir. |
Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
def handle_input(self, data_type: Type[Any]) -> TFPreTrainedModel:
"""Reads HFModel.
Args:
data_type: The type of the model to read.
Returns:
The model read from the specified dir.
"""
super().handle_input(data_type)
config = AutoConfig.from_pretrained(
os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR)
)
architecture = "TF" + config.architectures[0]
model_cls = getattr(
importlib.import_module("transformers"), architecture
)
return model_cls.from_pretrained(
os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR)
)
handle_return(self, model)
Writes a Model to the specified dir.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
model |
Type[Any] |
The TF Model to write. |
required |
Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
def handle_return(self, model: Type[Any]) -> None:
"""Writes a Model to the specified dir.
Args:
model: The TF Model to write.
"""
super().handle_return(model)
temp_dir = TemporaryDirectory()
model.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR),
)
huggingface_tokenizer_materializer
Implementation of the Huggingface tokenizer materializer.
HFTokenizerMaterializer (BaseMaterializer)
Materializer to read tokenizer to and from huggingface tokenizer.
Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
class HFTokenizerMaterializer(BaseMaterializer):
"""Materializer to read tokenizer to and from huggingface tokenizer."""
ASSOCIATED_TYPES = (PreTrainedTokenizerBase,)
ASSOCIATED_ARTIFACT_TYPES = (ModelArtifact,)
def handle_input(self, data_type: Type[Any]) -> PreTrainedTokenizerBase:
"""Reads Tokenizer.
Args:
data_type: The type of the tokenizer to read.
Returns:
The tokenizer read from the specified dir.
"""
super().handle_input(data_type)
return AutoTokenizer.from_pretrained(
os.path.join(self.artifact.uri, DEFAULT_TOKENIZER_DIR)
)
def handle_return(self, tokenizer: Type[Any]) -> None:
"""Writes a Tokenizer to the specified dir.
Args:
tokenizer: The HFTokenizer to write.
"""
super().handle_return(tokenizer)
temp_dir = TemporaryDirectory()
tokenizer.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.artifact.uri, DEFAULT_TOKENIZER_DIR),
)
handle_input(self, data_type)
Reads Tokenizer.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data_type |
Type[Any] |
The type of the tokenizer to read. |
required |
Returns:
Type | Description |
---|---|
PreTrainedTokenizerBase |
The tokenizer read from the specified dir. |
Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
def handle_input(self, data_type: Type[Any]) -> PreTrainedTokenizerBase:
"""Reads Tokenizer.
Args:
data_type: The type of the tokenizer to read.
Returns:
The tokenizer read from the specified dir.
"""
super().handle_input(data_type)
return AutoTokenizer.from_pretrained(
os.path.join(self.artifact.uri, DEFAULT_TOKENIZER_DIR)
)
handle_return(self, tokenizer)
Writes a Tokenizer to the specified dir.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tokenizer |
Type[Any] |
The HFTokenizer to write. |
required |
Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
def handle_return(self, tokenizer: Type[Any]) -> None:
"""Writes a Tokenizer to the specified dir.
Args:
tokenizer: The HFTokenizer to write.
"""
super().handle_return(tokenizer)
temp_dir = TemporaryDirectory()
tokenizer.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.artifact.uri, DEFAULT_TOKENIZER_DIR),
)