Skip to content

Embeddings

pgraf.embeddings

Engine

Bases: Enum

Enum for the available embedding engines

Source code in pgraf/embeddings.py
class Engine(enum.Enum):
    """Identifies which embedding backend to use."""

    HUGGING_FACE = 'hugging-face'
    OPENAI = 'openai'

HuggingFace

Handles the generation of vector embeddings for text content.

This class provides functionality to convert text into vector embeddings using sentence transformers. It handles chunking of text to ensure optimal embedding generation.

Parameters:

Name Type Description Default
model str | None

The sentence transformer model to use for embeddings

DEFAULT_HUGGING_FACE_MODEL
Source code in pgraf/embeddings.py
class HuggingFace:
    """Handles the generation of vector embeddings for text content.

    Converts text into vector embeddings with a sentence transformer
    model. Input text is chunked before encoding so each piece stays
    at a size suitable for embedding generation.

    Args:
        model: The sentence transformer model to use for embeddings
    """

    def __init__(self, model: str | None = DEFAULT_HUGGING_FACE_MODEL) -> None:
        """Initialize the embeddings generator with the specified model.

        Args:
            model: The sentence transformer model to use
                (defaults to 'all-MiniLM-L6-v2')
        """
        # Fall back to the default when None (or empty) is passed in.
        name = model if model else DEFAULT_HUGGING_FACE_MODEL
        self.transformer = sentence_transformers.SentenceTransformer(name)

    def get(self, value: str) -> list[numpy.ndarray]:
        """Generate embeddings for the provided text value.

        The text is automatically chunked into manageable pieces
        using sentence boundaries and maximum word count.

        Args:
            value: The text to generate embeddings for

        Returns:
            A list of numpy arrays containing the embeddings for each chunk
        """
        # One embedding per chunk, returned in chunk order.
        return [
            self.transformer.encode(
                piece, convert_to_numpy=True, convert_to_tensor=False
            )
            for piece in _chunk_text(value)
        ]

__init__(model=DEFAULT_HUGGING_FACE_MODEL)

Initialize the embeddings generator with the specified model.

Parameters:

Name Type Description Default
model str | None

The sentence transformer model to use (defaults to 'all-MiniLM-L6-v2')

DEFAULT_HUGGING_FACE_MODEL
Source code in pgraf/embeddings.py
def __init__(self, model: str | None = DEFAULT_HUGGING_FACE_MODEL) -> None:
    """Initialize the embeddings generator with the specified model.

    Args:
        model: The sentence transformer model to use
            (defaults to 'all-MiniLM-L6-v2')
    """
    # An explicit None (or empty string) falls back to the default model.
    chosen = model or DEFAULT_HUGGING_FACE_MODEL
    self.transformer = sentence_transformers.SentenceTransformer(chosen)

get(value)

Generate embeddings for the provided text value.

The text is automatically chunked into manageable pieces using sentence boundaries and maximum word count.

Parameters:

Name Type Description Default
value str

The text to generate embeddings for

required

Returns:

Type Description
list[ndarray]

A list of numpy arrays containing the embeddings for each chunk

Source code in pgraf/embeddings.py
def get(self, value: str) -> list[numpy.ndarray]:
    """Generate embeddings for the provided text value.

    The text is automatically chunked into manageable pieces
    using sentence boundaries and maximum word count.

    Args:
        value: The text to generate embeddings for

    Returns:
        A list of numpy arrays containing the embeddings for each chunk
    """
    # Encode every chunk independently; order follows _chunk_text.
    return [
        self.transformer.encode(
            piece, convert_to_numpy=True, convert_to_tensor=False
        )
        for piece in _chunk_text(value)
    ]

OpenAI

Handles the generation of vector embeddings for text content using the OpenAI client.

Source code in pgraf/embeddings.py
class OpenAI:
    """Handles the generation of vector embeddings for text content using the
    OpenAI client
    """

    def __init__(
        self,
        model: str | None = DEFAULT_OPENAI_MODEL,
        api_key: str | None = None,
    ) -> None:
        """Initialize the embeddings generator with the specified model."""
        # None (or empty) model falls back to the package default.
        self.model = model if model else DEFAULT_OPENAI_MODEL
        self.client = openai.OpenAI(api_key=api_key)

    def get(self, value: str) -> list[numpy.ndarray]:
        """Generate embeddings for the provided text value.

        The text is automatically chunked into manageable pieces
        using sentence boundaries and maximum word count.

        Args:
            value: The text to generate embeddings for

        Returns:
            A list of numpy arrays containing the embeddings for each chunk
        """
        # One API call per chunk; only the first embedding of each
        # response is used since a single input is sent each time.
        results: list[numpy.ndarray] = []
        for piece in _chunk_text(value):
            reply = self.client.embeddings.create(
                input=piece, model=self.model
            )
            results.append(numpy.array(reply.data[0].embedding))
        return results

__init__(model=DEFAULT_OPENAI_MODEL, api_key=None)

Initialize the embeddings generator with the specified model.

Source code in pgraf/embeddings.py
def __init__(
    self,
    model: str | None = DEFAULT_OPENAI_MODEL,
    api_key: str | None = None,
) -> None:
    """Initialize the embeddings generator with the specified model."""
    # Resolve the model first; None falls back to the default.
    self.model = model if model else DEFAULT_OPENAI_MODEL
    self.client = openai.OpenAI(api_key=api_key)

get(value)

Generate embeddings for the provided text value.

The text is automatically chunked into manageable pieces using sentence boundaries and maximum word count.

Parameters:

Name Type Description Default
value str

The text to generate embeddings for

required

Returns:

Type Description
list[ndarray]

A list of numpy arrays containing the embeddings for each chunk

Source code in pgraf/embeddings.py
def get(self, value: str) -> list[numpy.ndarray]:
    """Generate embeddings for the provided text value.

    The text is automatically chunked into manageable pieces
    using sentence boundaries and maximum word count.

    Args:
        value: The text to generate embeddings for

    Returns:
        A list of numpy arrays containing the embeddings for each chunk
    """
    # One request per chunk; each response carries a single embedding
    # because a single input string is submitted per call.
    return [
        numpy.array(
            self.client.embeddings.create(
                input=piece, model=self.model
            ).data[0].embedding
        )
        for piece in _chunk_text(value)
    ]