Skip to content

EmbeddingWithHuggingFace

Embedding class using HuggingFace model based on LangChain Embeddings class.

EmbeddingWithHuggingFace

Bases: Embeddings

Embedding class using HuggingFace model based on LangChain Embeddings class.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
class EmbeddingWithHuggingFace(Embeddings):
    """
    Embedding class using HuggingFace model based on LangChain Embeddings class.

    Texts are tokenized, passed through the model, and the resulting token
    embeddings are mean-pooled (weighted by the attention mask) into one
    vector per text.
    """

    def __init__(
        self,
        model_name: str,
        model_cache_dir: str = None,
        truncation: bool = True,
        device: str = "cpu",
    ):
        """
        Initialize the EmbeddingWithHuggingFace class.

        Args:
            model_name: The name of the HuggingFace model to be used.
            model_cache_dir: The directory to cache the HuggingFace model.
                It is forwarded as ``cache_dir`` to every ``from_pretrained`` call.
            truncation: The truncation flag for the HuggingFace tokenizer.
            device: The device to run the model on.

        Raises:
            ValueError: If the configuration of ``model_name`` cannot be loaded.
        """

        # Set parameters
        self.model_name = model_name
        self.model_cache_dir = model_cache_dir
        self.truncation = truncation
        self.device = device

        # Probe the model configuration first so that an unknown model fails
        # early with a clear message. The probe uses the same cache directory
        # as the tokenizer/model loads below so all three lookups agree.
        try:
            AutoConfig.from_pretrained(
                self.model_name, cache_dir=self.model_cache_dir
            )
        except EnvironmentError as e:
            raise ValueError(
                f"Model {self.model_name} is not available on HuggingFace Hub."
            ) from e

        # Load HuggingFace tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, cache_dir=self.model_cache_dir
        )
        self.model = AutoModel.from_pretrained(
            self.model_name, cache_dir=self.model_cache_dir
        )

    def meanpooling(self, output, mask) -> torch.Tensor:
        """
        Mean Pooling - Take attention mask into account for correct averaging.
        According to the following documentation:
        https://huggingface.co/NeuML/pubmedbert-base-embeddings

        Args:
            output: The output of the model; its first element holds the
                token embeddings.
            mask: The attention mask matching the token embeddings
                (one dimension fewer than the token embeddings).

        Returns:
            The mask-weighted mean of the token embeddings, reduced over
            dimension 1.
        """
        embeddings = output[0]  # First element of model_output contains all token embeddings
        mask = mask.unsqueeze(-1).expand(embeddings.size()).float()
        # Clamp the denominator so that an all-zero mask does not divide by zero
        return torch.sum(embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

    def _embed(self, texts) -> torch.Tensor:
        """
        Tokenize the input, run the model and mean-pool the token embeddings.

        Args:
            texts: A single text or a list of texts, passed as-is to the tokenizer.

        Returns:
            The pooled embeddings as a tensor moved to the CPU.
        """
        # Inference only: gradient tracking is not needed
        with torch.no_grad():
            inputs = self.tokenizer(
                texts,
                padding=True,
                truncation=self.truncation,
                return_tensors="pt",
            ).to(self.device)
            outputs = self.model.to(self.device)(**inputs)
            return self.meanpooling(outputs, inputs["attention_mask"]).cpu()

    def embed_documents(self, texts: List[str]) -> torch.Tensor:
        """
        Generate embeddings for a list of input texts using HuggingFace model.

        Args:
            texts: The list of texts to be embedded.

        Returns:
            The embeddings for the given texts, as a CPU tensor whose first
            dimension indexes the texts.
        """
        return self._embed(texts)

    def embed_query(self, text: str) -> torch.Tensor:
        """
        Generate embeddings for an input text using HuggingFace model.

        Args:
            text: A query to be embedded.

        Returns:
            The embedding for the given query, as a CPU tensor (the first row
            of the pooled output).
        """
        return self._embed(text)[0]

__init__(model_name, model_cache_dir=None, truncation=True, device='cpu')

Initialize the EmbeddingWithHuggingFace class.

Parameters:

Name Type Description Default
model_name str

The name of the HuggingFace model to be used.

required
model_cache_dir str

The directory to cache the HuggingFace model.

None
truncation bool

The truncation flag for the HuggingFace tokenizer.

True
return_tensors

Not a parameter of `__init__`: the tokenizer is always called with return_tensors="pt".

n/a
device str

The device to run the model on.

'cpu'
Source code in aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def __init__(
    self,
    model_name: str,
    model_cache_dir: str = None,
    truncation: bool = True,
    device: str = "cpu",
):
    """
    Initialize the EmbeddingWithHuggingFace class.

    Args:
        model_name: The name of the HuggingFace model to be used.
        model_cache_dir: The directory to cache the HuggingFace model.
            It is forwarded as ``cache_dir`` to every ``from_pretrained`` call.
        truncation: The truncation flag for the HuggingFace tokenizer.
        device: The device to run the model on.

    Raises:
        ValueError: If the configuration of ``model_name`` cannot be loaded.
    """

    # Set parameters
    self.model_name = model_name
    self.model_cache_dir = model_cache_dir
    self.truncation = truncation
    self.device = device

    # Probe the model configuration first so that an unknown model fails
    # early with a clear message. The probe uses the same cache directory
    # as the tokenizer/model loads below so all three lookups agree.
    try:
        AutoConfig.from_pretrained(
            self.model_name, cache_dir=self.model_cache_dir
        )
    except EnvironmentError as e:
        raise ValueError(
            f"Model {self.model_name} is not available on HuggingFace Hub."
        ) from e

    # Load HuggingFace tokenizer and model
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.model_name, cache_dir=self.model_cache_dir
    )
    self.model = AutoModel.from_pretrained(
        self.model_name, cache_dir=self.model_cache_dir
    )

embed_documents(texts)

Generate embeddings for a list of input texts using a HuggingFace model.

Parameters:

Name Type Description Default
texts List[str]

The list of texts to be embedded.

required

Returns:

Type Description
List[float]

The list of embeddings for the given texts.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def embed_documents(self, texts: List[str]) -> List[float]:
    """
    Embed a batch of texts with the HuggingFace model.

    Args:
        texts: The texts to embed.

    Returns:
        The mean-pooled embeddings of the given texts, moved to the CPU.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": self.truncation,
        "return_tensors": "pt",
    }

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        encoded = self.tokenizer(texts, **tokenizer_options).to(self.device)
        model = self.model.to(self.device)
        hidden = model(**encoded)
        # Average the token embeddings, weighted by the attention mask
        pooled = self.meanpooling(hidden, encoded["attention_mask"])

    return pooled.cpu()

embed_query(text)

Generate embeddings for an input text using HuggingFace model.

Parameters:

Name Type Description Default
text str

A query to be embedded.

required

Returns: The embeddings for the given query.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def embed_query(self, text: str) -> List[float]:
    """
    Embed a single query text with the HuggingFace model.

    Args:
        text: The query to embed.
    Returns:
        The mean-pooled embedding of the query (first row of the pooled
        output), moved to the CPU.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": self.truncation,
        "return_tensors": "pt",
    }

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        encoded = self.tokenizer(text, **tokenizer_options).to(self.device)
        model = self.model.to(self.device)
        hidden = model(**encoded)
        # Average the token embeddings, weighted by the attention mask
        pooled = self.meanpooling(hidden, encoded["attention_mask"])

    # The pooled output has one row per input; keep the only one
    return pooled.cpu()[0]

meanpooling(output, mask)

Mean Pooling - Take attention mask into account for correct averaging. According to the following documentation: https://huggingface.co/NeuML/pubmedbert-base-embeddings

Parameters:

Name Type Description Default
output

The output of the model.

required
mask

The mask of the model.

required
Source code in aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py
55
56
57
58
59
60
61
62
63
64
65
66
67
def meanpooling(self, output, mask) -> torch.Tensor:
    """
    Average the token embeddings, counting only positions kept by the mask.
    Follows the recipe documented at:
    https://huggingface.co/NeuML/pubmedbert-base-embeddings

    Args:
        output: The output of the model; its first element holds the token embeddings.
        mask: The attention mask matching the token embeddings.

    Returns:
        The mask-weighted mean of the token embeddings, reduced over dimension 1.
    """
    token_embeddings = output[0]
    # Broadcast the mask over the embedding dimension so it can weight every value
    weights = mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * weights).sum(dim=1)
    # Clamp the denominator so that an all-zero mask does not divide by zero
    kept_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / kept_counts