
PCST (Milvus Multimodal)

Extraction of a multimodal subgraph using the Prize-Collecting Steiner Tree (PCST) algorithm.

DynamicLibraryLoader

Dynamically load libraries based on system capabilities.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
class DynamicLibraryLoader:
    """Dynamically load libraries based on system capabilities."""

    def __init__(self, detector: SystemDetector):
        self.detector = detector
        self.use_gpu = detector.use_gpu

        # Initialize attributes that will be set later
        self.py = None
        self.df = None
        self.pd = None
        self.np = None
        self.cudf = None
        self.cp = None

        # Import libraries based on system capabilities
        self._import_libraries()

        # Dynamic settings based on hardware
        self.normalize_vectors = self.use_gpu  # Only normalize for GPU
        self.metric_type = "IP" if self.use_gpu else "COSINE"

        logger.info("Library Configuration:")
        logger.info("  Using GPU acceleration: %s", self.use_gpu)
        logger.info("  Vector normalization: %s", self.normalize_vectors)
        logger.info("  Metric type: %s", self.metric_type)

    def _import_libraries(self):
        """Dynamically import libraries based on system capabilities."""
        # Set base libraries
        self.pd = pd
        self.np = np

        # Conditionally import GPU libraries
        if self.detector.use_gpu:
            if CUDF_AVAILABLE:
                self.cudf = cudf
                self.cp = cp
                self.py = cp  # Use cupy for array operations
                self.df = cudf  # Use cudf for dataframes
                logger.info("Successfully imported GPU libraries (cudf, cupy)")
            else:
                logger.error("cudf or cupy not found. Falling back to CPU mode.")
                self.detector.use_gpu = False
                self.use_gpu = False
                self._setup_cpu_mode()
        else:
            self._setup_cpu_mode()

    def _setup_cpu_mode(self):
        """Setup CPU mode with numpy and pandas."""
        self.py = self.np  # Use numpy for array operations
        self.df = self.pd  # Use pandas for dataframes
        self.normalize_vectors = False
        self.metric_type = "COSINE"
        logger.info("Using CPU mode with numpy and pandas")

    def normalize_matrix(self, matrix, axis: int = 1):
        """Normalize matrix using appropriate library."""
        if not self.normalize_vectors:
            return matrix

        if self.use_gpu:
            # Use cupy for GPU
            matrix_cp = self.cp.asarray(matrix).astype(self.cp.float32)
            norms = self.cp.linalg.norm(matrix_cp, axis=axis, keepdims=True)
            return matrix_cp / norms
        # CPU mode doesn't normalize for COSINE similarity
        return matrix

    def to_list(self, data):
        """Convert data to list format."""
        if hasattr(data, "tolist"):
            return data.tolist()
        if hasattr(data, "to_arrow"):
            return data.to_arrow().to_pylist()
        return list(data)
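
The two classes in this module are meant to be used together: SystemDetector decides whether GPU acceleration is available, and DynamicLibraryLoader then exposes the matching array and dataframe backends through its py and df attributes. A minimal usage sketch follows; the import alias and the example matrix are illustrative assumptions, not part of the documented API surface.

from aiagents4pharma.talk2knowledgegraphs.utils.extractions.milvus_multimodal_pcst import (
    DynamicLibraryLoader,
    SystemDetector,
)

detector = SystemDetector()              # probes the OS and runs nvidia-smi
loader = DynamicLibraryLoader(detector)  # picks cupy/cudf or numpy/pandas

# loader.py is cupy on a CUDA-capable machine, numpy otherwise;
# loader.df is cudf or pandas accordingly.
vectors = loader.py.ones((4, 8), dtype=loader.py.float32)
vectors = loader.normalize_matrix(vectors)   # no-op in CPU/COSINE mode
print(loader.metric_type, loader.to_list(vectors[0]))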

_import_libraries()

Dynamically import libraries based on system capabilities.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def _import_libraries(self):
    """Dynamically import libraries based on system capabilities."""
    # Set base libraries
    self.pd = pd
    self.np = np

    # Conditionally import GPU libraries
    if self.detector.use_gpu:
        if CUDF_AVAILABLE:
            self.cudf = cudf
            self.cp = cp
            self.py = cp  # Use cupy for array operations
            self.df = cudf  # Use cudf for dataframes
            logger.info("Successfully imported GPU libraries (cudf, cupy)")
        else:
            logger.error("cudf or cupy not found. Falling back to CPU mode.")
            self.detector.use_gpu = False
            self.use_gpu = False
            self._setup_cpu_mode()
    else:
        self._setup_cpu_mode()

_setup_cpu_mode()

Set up CPU mode with numpy and pandas.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def _setup_cpu_mode(self):
    """Setup CPU mode with numpy and pandas."""
    self.py = self.np  # Use numpy for array operations
    self.df = self.pd  # Use pandas for dataframes
    self.normalize_vectors = False
    self.metric_type = "COSINE"
    logger.info("Using CPU mode with numpy and pandas")

normalize_matrix(matrix, axis=1)

Normalize matrix using appropriate library.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def normalize_matrix(self, matrix, axis: int = 1):
    """Normalize matrix using appropriate library."""
    if not self.normalize_vectors:
        return matrix

    if self.use_gpu:
        # Use cupy for GPU
        matrix_cp = self.cp.asarray(matrix).astype(self.cp.float32)
        norms = self.cp.linalg.norm(matrix_cp, axis=axis, keepdims=True)
        return matrix_cp / norms
    # CPU mode doesn't normalize for COSINE similarity
    return matrix
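
Only the GPU path normalizes: the loader then queries Milvus with the "IP" (inner product) metric, which on L2-normalized vectors gives the same ranking as cosine similarity on the raw vectors, while the CPU path skips normalization and relies on Milvus's COSINE metric directly. A numpy-only sketch of that equivalence, using made-up vectors:

import numpy as np

# Inner product of L2-normalized vectors equals cosine similarity of the raw vectors.
a = np.array([3.0, 4.0], dtype=np.float32)
b = np.array([1.0, 2.0], dtype=np.float32)

cosine = a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
ip_normalized = (a / np.linalg.norm(a)) @ (b / np.linalg.norm(b))
assert np.isclose(cosine, ip_normalized)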

to_list(data)

Convert data to list format.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def to_list(self, data):
    """Convert data to list format."""
    if hasattr(data, "tolist"):
        return data.tolist()
    if hasattr(data, "to_arrow"):
        return data.to_arrow().to_pylist()
    return list(data)
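
The conversion prefers the cheapest path available: numpy and cupy arrays expose tolist(), cudf objects are routed through to_arrow().to_pylist(), and anything else falls back to list(). A CPU-only sketch:

import numpy as np

# numpy arrays take the .tolist() branch; a cudf Series (GPU mode) would instead
# go through .to_arrow().to_pylist(); plain iterables fall back to list().
print(np.arange(3).tolist())   # [0, 1, 2]
print(list(range(3)))          # [0, 1, 2]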

MultimodalPCSTPruning

Bases: NamedTuple

Prize-Collecting Steiner Tree (PCST) pruning algorithm implementation inspired by G-Retriever (He et al., 'G-Retriever: Retrieval-Augmented Generation for Textual Graph Understanding and Question Answering', NeurIPS 2024) paper. https://arxiv.org/abs/2402.07630 https://github.com/XiaoxinHe/G-Retriever/blob/main/src/dataset/utils/retrieval.py

Parameters:

    topk (int, default 3): The number of top nodes to consider.
    topk_e (int, default 3): The number of top edges to consider.
    cost_e (float, default 0.5): The cost of the edges.
    c_const (float, default 0.01): The constant value for the cost of the edges computation.
    root (int, default -1): The root node of the subgraph, -1 for unrooted.
    num_clusters (int, default 1): The number of clusters.
    pruning (str, default "gw"): The pruning strategy to use.
    verbosity_level (int, default 0): The verbosity level.
    use_description (bool, default False): Whether to use description embeddings.
    metric_type (str, default None): The similarity metric type (dynamic based on hardware).
    loader (DynamicLibraryLoader, default None): The dynamic library loader instance.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
class MultimodalPCSTPruning(NamedTuple):
    """
    Prize-Collecting Steiner Tree (PCST) pruning algorithm implementation inspired by G-Retriever
    (He et al., 'G-Retriever: Retrieval-Augmented Generation for Textual Graph Understanding and
    Question Answering', NeurIPS 2024) paper.
    https://arxiv.org/abs/2402.07630
    https://github.com/XiaoxinHe/G-Retriever/blob/main/src/dataset/utils/retrieval.py

    Args:
        topk: The number of top nodes to consider.
        topk_e: The number of top edges to consider.
        cost_e: The cost of the edges.
        c_const: The constant value for the cost of the edges computation.
        root: The root node of the subgraph, -1 for unrooted.
        num_clusters: The number of clusters.
        pruning: The pruning strategy to use.
        verbosity_level: The verbosity level.
        use_description: Whether to use description embeddings.
        metric_type: The similarity metric type (dynamic based on hardware).
        loader: The dynamic library loader instance.
    """

    topk: int = 3
    topk_e: int = 3
    cost_e: float = 0.5
    c_const: float = 0.01
    root: int = -1
    num_clusters: int = 1
    pruning: str = "gw"
    verbosity_level: int = 0
    use_description: bool = False
    metric_type: str = None  # Will be set dynamically
    loader: DynamicLibraryLoader = None

    def prepare_collections(self, cfg: dict, modality: str) -> dict:
        """
        Prepare the collections for nodes, node-type specific nodes, and edges in Milvus.

        Args:
            cfg: The configuration dictionary containing the Milvus setup.
            modality: The modality to use for the subgraph extraction.

        Returns:
            A dictionary containing the collections of nodes, node-type specific nodes, and edges.
        """
        # Initialize the collections dictionary
        colls = {}

        # Load the collection for nodes
        colls["nodes"] = Collection(name=f"{cfg.milvus_db.database_name}_nodes")

        if modality != "prompt":
            # Load the collection for the specific node type
            colls["nodes_type"] = Collection(
                f"{cfg.milvus_db.database_name}_nodes_{modality.replace('/', '_')}"
            )

        # Load the collection for edges
        colls["edges"] = Collection(name=f"{cfg.milvus_db.database_name}_edges")

        # Load the collections
        for coll in colls.values():
            coll.load()

        return colls

    def _compute_node_prizes(self, query_emb: list, colls: dict) -> dict:
        """
        Compute the node prizes based on the similarity between the query and nodes.

        Args:
            query_emb: The query embedding. This can be an embedding of
                a prompt, sequence, or any other feature to be used for the subgraph extraction.
            colls: The collections of nodes, node-type specific nodes, and edges in Milvus.

        Returns:
            The prizes of the nodes.
        """
        # Initialize several variables
        topk = min(self.topk, colls["nodes"].num_entities)
        n_prizes = self.loader.py.zeros(
            colls["nodes"].num_entities, dtype=self.loader.py.float32
        )

        # Get the actual metric type to use
        actual_metric_type = self.metric_type or self.loader.metric_type

        # Calculate similarity for text features and update the score
        if self.use_description:
            # Search the collection with the text embedding
            res = colls["nodes"].search(
                data=[query_emb],
                anns_field="desc_emb",
                param={"metric_type": actual_metric_type},
                limit=topk,
                output_fields=["node_id"],
            )
        else:
            # Search the collection with the query embedding
            res = colls["nodes_type"].search(
                data=[query_emb],
                anns_field="feat_emb",
                param={"metric_type": actual_metric_type},
                limit=topk,
                output_fields=["node_id"],
            )

        # Update the prizes based on the search results
        n_prizes[[r.id for r in res[0]]] = self.loader.py.arange(topk, 0, -1).astype(
            self.loader.py.float32
        )

        return n_prizes

    def _compute_edge_prizes(self, text_emb: list, colls: dict):
        """
        Compute the edge prizes based on the similarity between the query and edges.

        Args:
            text_emb: The textual description embedding.
            colls: The collections of nodes, node-type specific nodes, and edges in Milvus.

        Returns:
            The prizes of the edges.
        """
        # Initialize several variables
        topk_e = min(self.topk_e, colls["edges"].num_entities)
        e_prizes = self.loader.py.zeros(
            colls["edges"].num_entities, dtype=self.loader.py.float32
        )

        # Get the actual metric type to use
        actual_metric_type = self.metric_type or self.loader.metric_type

        # Search the collection with the query embedding
        res = colls["edges"].search(
            data=[text_emb],
            anns_field="feat_emb",
            param={"metric_type": actual_metric_type},
            limit=topk_e,  # Only retrieve the top-k edges
            output_fields=["head_id", "tail_id"],
        )

        # Update the prizes based on the search results
        e_prizes[[r.id for r in res[0]]] = [r.score for r in res[0]]

        # Further process the edge_prizes
        unique_prizes, inverse_indices = self.loader.py.unique(
            e_prizes, return_inverse=True
        )
        topk_e_values = unique_prizes[self.loader.py.argsort(-unique_prizes)[:topk_e]]
        last_topk_e_value = topk_e
        for k in range(topk_e):
            indices = (
                inverse_indices == (unique_prizes == topk_e_values[k]).nonzero()[0]
            )
            value = min((topk_e - k) / indices.sum().item(), last_topk_e_value)
            e_prizes[indices] = value
            last_topk_e_value = value * (1 - self.c_const)

        return e_prizes

    def compute_prizes(self, text_emb: list, query_emb: list, colls: dict) -> dict:
        """
        Compute the node prizes based on the cosine similarity between the query and nodes,
        as well as the edge prizes based on the cosine similarity between the query and edges.
        Note that the node and edge embeddings shall use the same embedding model and dimensions
        with the query.

        Args:
            text_emb: The textual description embedding.
            query_emb: The query embedding. This can be an embedding of
                a prompt, sequence, or any other feature to be used for the subgraph extraction.
            colls: The collections of nodes, node-type specific nodes, and edges in Milvus.

        Returns:
            The prizes of the nodes and edges.
        """
        # Compute prizes for nodes
        logger.log(logging.INFO, "_compute_node_prizes")
        n_prizes = self._compute_node_prizes(query_emb, colls)

        # Compute prizes for edges
        logger.log(logging.INFO, "_compute_edge_prizes")
        e_prizes = self._compute_edge_prizes(text_emb, colls)

        return {"nodes": n_prizes, "edges": e_prizes}

    def compute_subgraph_costs(self, edge_index, num_nodes: int, prizes: dict):
        """
        Compute the costs in constructing the subgraph proposed by G-Retriever paper.

        Args:
            edge_index: The edge index of the graph, consisting of source and destination nodes.
            num_nodes: The number of nodes in the graph.
            prizes: The prizes of the nodes and the edges.

        Returns:
            edges: The edges of the subgraph, consisting of edges and number of edges without
                virtual edges.
            prizes: The prizes of the subgraph.
            costs: The costs of the subgraph.
        """
        # Initialize several variables
        real_ = {}
        virt_ = {}

        # Update edge cost threshold
        updated_cost_e = min(
            self.cost_e,
            self.loader.py.max(prizes["edges"]).item() * (1 - self.c_const / 2),
        )

        # Masks for real and virtual edges
        logger.log(logging.INFO, "Creating masks for real and virtual edges")
        real_["mask"] = prizes["edges"] <= updated_cost_e
        virt_["mask"] = ~real_["mask"]

        # Real edge indices
        logger.log(logging.INFO, "Computing real edges")
        real_["indices"] = self.loader.py.nonzero(real_["mask"])[0]
        real_["src"] = edge_index[0][real_["indices"]]
        real_["dst"] = edge_index[1][real_["indices"]]
        real_["edges"] = self.loader.py.stack([real_["src"], real_["dst"]], axis=1)
        real_["costs"] = updated_cost_e - prizes["edges"][real_["indices"]]

        # Edge index mapping: local real edge idx -> original global index
        logger.log(logging.INFO, "Creating mapping for real edges")
        mapping_edges = dict(
            zip(range(len(real_["indices"])), self.loader.to_list(real_["indices"]))
        )

        # Virtual edge handling
        logger.log(logging.INFO, "Computing virtual edges")
        virt_["indices"] = self.loader.py.nonzero(virt_["mask"])[0]
        virt_["src"] = edge_index[0][virt_["indices"]]
        virt_["dst"] = edge_index[1][virt_["indices"]]
        virt_["prizes"] = prizes["edges"][virt_["indices"]] - updated_cost_e

        # Generate virtual node IDs
        logger.log(logging.INFO, "Generating virtual node IDs")
        virt_["num"] = virt_["indices"].shape[0]
        virt_["node_ids"] = self.loader.py.arange(num_nodes, num_nodes + virt_["num"])

        # Virtual edges: (src → virtual), (virtual → dst)
        logger.log(logging.INFO, "Creating virtual edges")
        virt_["edges_1"] = self.loader.py.stack(
            [virt_["src"], virt_["node_ids"]], axis=1
        )
        virt_["edges_2"] = self.loader.py.stack(
            [virt_["node_ids"], virt_["dst"]], axis=1
        )
        virt_["edges"] = self.loader.py.concatenate(
            [virt_["edges_1"], virt_["edges_2"]], axis=0
        )
        virt_["costs"] = self.loader.py.zeros(
            (virt_["edges"].shape[0],), dtype=real_["costs"].dtype
        )

        # Combine real and virtual edges/costs
        logger.log(logging.INFO, "Combining real and virtual edges/costs")
        all_edges = self.loader.py.concatenate([real_["edges"], virt_["edges"]], axis=0)
        all_costs = self.loader.py.concatenate([real_["costs"], virt_["costs"]], axis=0)

        # Final prizes
        logger.log(logging.INFO, "Getting final prizes")
        final_prizes = self.loader.py.concatenate(
            [prizes["nodes"], virt_["prizes"]], axis=0
        )

        # Mapping virtual node ID -> edge index in original graph
        logger.log(logging.INFO, "Creating mapping for virtual nodes")
        mapping_nodes = dict(
            zip(
                self.loader.to_list(virt_["node_ids"]),
                self.loader.to_list(virt_["indices"]),
            )
        )

        # Build return values
        logger.log(logging.INFO, "Building return values")
        edges_dict = {
            "edges": all_edges,
            "num_prior_edges": real_["edges"].shape[0],
        }
        mapping = {
            "edges": mapping_edges,
            "nodes": mapping_nodes,
        }

        return edges_dict, final_prizes, all_costs, mapping

    def get_subgraph_nodes_edges(
        self, num_nodes: int, vertices, edges_dict: dict, mapping: dict
    ) -> dict:
        """
        Get the selected nodes and edges of the subgraph based on the vertices and edges computed
        by the PCST algorithm.

        Args:
            num_nodes: The number of nodes in the graph.
            vertices: The vertices selected by the PCST algorithm.
            edges_dict: A dictionary containing the edges and the number of prior edges.
            mapping: A dictionary containing the mapping of nodes and edges.

        Returns:
            The selected nodes and edges of the extracted subgraph.
        """
        # Get edges information
        edges = edges_dict["edges"]
        num_prior_edges = edges_dict["num_prior_edges"]

        # Retrieve the selected nodes and edges based on the given vertices and edges
        subgraph_nodes = vertices[vertices < num_nodes]
        subgraph_edges = [
            mapping["edges"][e.item()] for e in edges if e < num_prior_edges
        ]
        virtual_vertices = vertices[vertices >= num_nodes]
        if len(virtual_vertices) > 0:
            virtual_edges = [mapping["nodes"][i.item()] for i in virtual_vertices]
            subgraph_edges = self.loader.py.array(subgraph_edges + virtual_edges)
        edge_index = edges_dict["edge_index"][:, subgraph_edges]
        subgraph_nodes = self.loader.py.unique(
            self.loader.py.concatenate([subgraph_nodes, edge_index[0], edge_index[1]])
        )

        return {"nodes": subgraph_nodes, "edges": subgraph_edges}

    def extract_subgraph(
        self, text_emb: list, query_emb: list, modality: str, cfg: dict
    ) -> dict:
        """
        Perform the Prize-Collecting Steiner Tree (PCST) algorithm to extract the subgraph.

        Args:
            text_emb: The textual description embedding.
            query_emb: The query embedding. This can be an embedding of
                a prompt, sequence, or any other feature to be used for the subgraph extraction.
            modality: The modality to use for the subgraph extraction
                (e.g., "text", "sequence", "smiles").
            cfg: The configuration dictionary containing the Milvus setup.

        Returns:
            The selected nodes and edges of the subgraph.
        """
        # Load the collections for nodes
        logger.log(logging.INFO, "Preparing collections")
        colls = self.prepare_collections(cfg, modality)

        # Load cache edge index
        logger.log(logging.INFO, "Loading cache edge index")
        with open(cfg.milvus_db.cache_edge_index_path, "rb") as f:
            edge_index = pickle.load(f)
            edge_index = self.loader.py.array(edge_index)

        # Assert the topk and topk_e values for subgraph retrieval
        assert self.topk > 0, "topk must be greater than 0"
        assert self.topk_e > 0, "topk_e must be greater than 0"

        # Retrieve the top-k nodes and edges based on the query embedding
        logger.log(logging.INFO, "compute_prizes")
        prizes = self.compute_prizes(text_emb, query_emb, colls)

        # Compute costs in constructing the subgraph
        logger.log(logging.INFO, "compute_subgraph_costs")
        edges_dict, prizes, costs, mapping = self.compute_subgraph_costs(
            edge_index, colls["nodes"].num_entities, prizes
        )

        # Retrieve the subgraph using the PCST algorithm
        logger.log(logging.INFO, "Running PCST algorithm")
        result_vertices, result_edges = pcst_fast.pcst_fast(
            edges_dict["edges"].tolist(),
            prizes.tolist(),
            costs.tolist(),
            self.root,
            self.num_clusters,
            self.pruning,
            self.verbosity_level,
        )

        # Get subgraph nodes and edges based on the result of the PCST algorithm
        logger.log(logging.INFO, "Getting subgraph nodes and edges")
        subgraph = self.get_subgraph_nodes_edges(
            colls["nodes"].num_entities,
            self.loader.py.asarray(result_vertices),
            {
                "edges": self.loader.py.asarray(result_edges),
                "num_prior_edges": edges_dict["num_prior_edges"],
                "edge_index": edge_index,
            },
            mapping,
        )
        print(subgraph)

        return subgraph
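
A construction and invocation sketch under stated assumptions: a Milvus connection has already been established elsewhere, the collections and the cached edge index referenced by cfg exist, and the embeddings come from the same model that populated the collections, so the snippet is not runnable without that infrastructure. The config attributes mirror those accessed in the source above; every concrete value below is hypothetical, and the imports from the earlier sketch are reused.

from types import SimpleNamespace

# Hypothetical config exposing the attributes used by extract_subgraph().
cfg = SimpleNamespace(
    milvus_db=SimpleNamespace(
        database_name="t2kg",
        cache_edge_index_path="/path/to/edge_index.pkl",
    )
)

detector = SystemDetector()
loader = DynamicLibraryLoader(detector)

pcst = MultimodalPCSTPruning(
    topk=5,
    topk_e=5,
    use_description=False,
    metric_type=loader.metric_type,  # "IP" on GPU, "COSINE" on CPU
    loader=loader,
)

# text_emb / query_emb would normally come from the embedding model used to
# build the Milvus collections; the dimension here is a placeholder.
text_emb = [0.0] * 768
query_emb = [0.0] * 768

subgraph = pcst.extract_subgraph(text_emb, query_emb, modality="sequence", cfg=cfg)
print(subgraph["nodes"], subgraph["edges"])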

_compute_edge_prizes(text_emb, colls)

Compute the edge prizes based on the similarity between the query and edges.

Parameters:

    text_emb (list, required): The textual description embedding.
    colls (dict, required): The collections of nodes, node-type specific nodes, and edges in Milvus.

Returns:

    The prizes of the edges.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def _compute_edge_prizes(self, text_emb: list, colls: dict):
    """
    Compute the edge prizes based on the similarity between the query and edges.

    Args:
        text_emb: The textual description embedding.
        colls: The collections of nodes, node-type specific nodes, and edges in Milvus.

    Returns:
        The prizes of the edges.
    """
    # Initialize several variables
    topk_e = min(self.topk_e, colls["edges"].num_entities)
    e_prizes = self.loader.py.zeros(
        colls["edges"].num_entities, dtype=self.loader.py.float32
    )

    # Get the actual metric type to use
    actual_metric_type = self.metric_type or self.loader.metric_type

    # Search the collection with the query embedding
    res = colls["edges"].search(
        data=[text_emb],
        anns_field="feat_emb",
        param={"metric_type": actual_metric_type},
        limit=topk_e,  # Only retrieve the top-k edges
        output_fields=["head_id", "tail_id"],
    )

    # Update the prizes based on the search results
    e_prizes[[r.id for r in res[0]]] = [r.score for r in res[0]]

    # Further process the edge_prizes
    unique_prizes, inverse_indices = self.loader.py.unique(
        e_prizes, return_inverse=True
    )
    topk_e_values = unique_prizes[self.loader.py.argsort(-unique_prizes)[:topk_e]]
    last_topk_e_value = topk_e
    for k in range(topk_e):
        indices = (
            inverse_indices == (unique_prizes == topk_e_values[k]).nonzero()[0]
        )
        value = min((topk_e - k) / indices.sum().item(), last_topk_e_value)
        e_prizes[indices] = value
        last_topk_e_value = value * (1 - self.c_const)

    return e_prizes
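
The post-processing loop turns raw similarity scores into rank-based prizes: edges tied at the k-th highest unique score share the value (topk_e - k) / count, capped by a running bound that shrinks by a factor of (1 - c_const) at every rank. A numpy-only illustration with made-up scores and topk_e = 3:

import numpy as np

topk_e, c_const = 3, 0.01
e_prizes = np.array([0.0, 0.9, 0.9, 0.7, 0.0, 0.5], dtype=np.float32)

unique_prizes, inverse_indices = np.unique(e_prizes, return_inverse=True)
topk_e_values = unique_prizes[np.argsort(-unique_prizes)[:topk_e]]
last_value = topk_e
for k in range(topk_e):
    indices = inverse_indices == (unique_prizes == topk_e_values[k]).nonzero()[0]
    value = min((topk_e - k) / indices.sum().item(), last_value)
    e_prizes[indices] = value
    last_value = value * (1 - c_const)

print(e_prizes)  # [0.0, 1.5, 1.5, 1.485, 0.0, 1.0]: tied top scores split the rank prize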

_compute_node_prizes(query_emb, colls)

Compute the node prizes based on the similarity between the query and nodes.

Parameters:

    query_emb (list, required): The query embedding. This can be an embedding of a prompt, sequence, or any other feature to be used for the subgraph extraction.
    colls (dict, required): The collections of nodes, node-type specific nodes, and edges in Milvus.

Returns:

    dict: The prizes of the nodes.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def _compute_node_prizes(self, query_emb: list, colls: dict) -> dict:
    """
    Compute the node prizes based on the similarity between the query and nodes.

    Args:
        query_emb: The query embedding. This can be an embedding of
            a prompt, sequence, or any other feature to be used for the subgraph extraction.
        colls: The collections of nodes, node-type specific nodes, and edges in Milvus.

    Returns:
        The prizes of the nodes.
    """
    # Initialize several variables
    topk = min(self.topk, colls["nodes"].num_entities)
    n_prizes = self.loader.py.zeros(
        colls["nodes"].num_entities, dtype=self.loader.py.float32
    )

    # Get the actual metric type to use
    actual_metric_type = self.metric_type or self.loader.metric_type

    # Calculate similarity for text features and update the score
    if self.use_description:
        # Search the collection with the text embedding
        res = colls["nodes"].search(
            data=[query_emb],
            anns_field="desc_emb",
            param={"metric_type": actual_metric_type},
            limit=topk,
            output_fields=["node_id"],
        )
    else:
        # Search the collection with the query embedding
        res = colls["nodes_type"].search(
            data=[query_emb],
            anns_field="feat_emb",
            param={"metric_type": actual_metric_type},
            limit=topk,
            output_fields=["node_id"],
        )

    # Update the prizes based on the search results
    n_prizes[[r.id for r in res[0]]] = self.loader.py.arange(topk, 0, -1).astype(
        self.loader.py.float32
    )

    return n_prizes
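
Node prizes are rank-based rather than raw similarities: the best of the topk Milvus hits receives prize topk, the next topk - 1, and so on, while every other node keeps a prize of zero. A numpy-only sketch with hypothetical hit IDs:

import numpy as np

topk = 3
n_prizes = np.zeros(10, dtype=np.float32)
hit_ids = [7, 3, 9]  # hypothetical Milvus result IDs, best match first
n_prizes[hit_ids] = np.arange(topk, 0, -1).astype(np.float32)
print(n_prizes)  # node 7 -> 3.0, node 3 -> 2.0, node 9 -> 1.0, all others 0.0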

compute_prizes(text_emb, query_emb, colls)

Compute the node prizes based on the cosine similarity between the query and nodes, as well as the edge prizes based on the cosine similarity between the query and edges. Note that the node and edge embeddings must use the same embedding model and dimensionality as the query.

Parameters:

    text_emb (list, required): The textual description embedding.
    query_emb (list, required): The query embedding. This can be an embedding of a prompt, sequence, or any other feature to be used for the subgraph extraction.
    colls (dict, required): The collections of nodes, node-type specific nodes, and edges in Milvus.

Returns:

    dict: The prizes of the nodes and edges.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def compute_prizes(self, text_emb: list, query_emb: list, colls: dict) -> dict:
    """
    Compute the node prizes based on the cosine similarity between the query and nodes,
    as well as the edge prizes based on the cosine similarity between the query and edges.
    Note that the node and edge embeddings shall use the same embedding model and dimensions
    with the query.

    Args:
        text_emb: The textual description embedding.
        query_emb: The query embedding. This can be an embedding of
            a prompt, sequence, or any other feature to be used for the subgraph extraction.
        colls: The collections of nodes, node-type specific nodes, and edges in Milvus.

    Returns:
        The prizes of the nodes and edges.
    """
    # Compute prizes for nodes
    logger.log(logging.INFO, "_compute_node_prizes")
    n_prizes = self._compute_node_prizes(query_emb, colls)

    # Compute prizes for edges
    logger.log(logging.INFO, "_compute_edge_prizes")
    e_prizes = self._compute_edge_prizes(text_emb, colls)

    return {"nodes": n_prizes, "edges": e_prizes}

compute_subgraph_costs(edge_index, num_nodes, prizes)

Compute the costs for constructing the subgraph, as proposed by the G-Retriever paper.

Parameters:

    edge_index (required): The edge index of the graph, consisting of source and destination nodes.
    num_nodes (int, required): The number of nodes in the graph.
    prizes (dict, required): The prizes of the nodes and the edges.

Returns:

    edges: The edges of the subgraph, consisting of edges and the number of edges without virtual edges.
    prizes: The prizes of the subgraph.
    costs: The costs of the subgraph.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def compute_subgraph_costs(self, edge_index, num_nodes: int, prizes: dict):
    """
    Compute the costs in constructing the subgraph proposed by G-Retriever paper.

    Args:
        edge_index: The edge index of the graph, consisting of source and destination nodes.
        num_nodes: The number of nodes in the graph.
        prizes: The prizes of the nodes and the edges.

    Returns:
        edges: The edges of the subgraph, consisting of edges and number of edges without
            virtual edges.
        prizes: The prizes of the subgraph.
        costs: The costs of the subgraph.
    """
    # Initialize several variables
    real_ = {}
    virt_ = {}

    # Update edge cost threshold
    updated_cost_e = min(
        self.cost_e,
        self.loader.py.max(prizes["edges"]).item() * (1 - self.c_const / 2),
    )

    # Masks for real and virtual edges
    logger.log(logging.INFO, "Creating masks for real and virtual edges")
    real_["mask"] = prizes["edges"] <= updated_cost_e
    virt_["mask"] = ~real_["mask"]

    # Real edge indices
    logger.log(logging.INFO, "Computing real edges")
    real_["indices"] = self.loader.py.nonzero(real_["mask"])[0]
    real_["src"] = edge_index[0][real_["indices"]]
    real_["dst"] = edge_index[1][real_["indices"]]
    real_["edges"] = self.loader.py.stack([real_["src"], real_["dst"]], axis=1)
    real_["costs"] = updated_cost_e - prizes["edges"][real_["indices"]]

    # Edge index mapping: local real edge idx -> original global index
    logger.log(logging.INFO, "Creating mapping for real edges")
    mapping_edges = dict(
        zip(range(len(real_["indices"])), self.loader.to_list(real_["indices"]))
    )

    # Virtual edge handling
    logger.log(logging.INFO, "Computing virtual edges")
    virt_["indices"] = self.loader.py.nonzero(virt_["mask"])[0]
    virt_["src"] = edge_index[0][virt_["indices"]]
    virt_["dst"] = edge_index[1][virt_["indices"]]
    virt_["prizes"] = prizes["edges"][virt_["indices"]] - updated_cost_e

    # Generate virtual node IDs
    logger.log(logging.INFO, "Generating virtual node IDs")
    virt_["num"] = virt_["indices"].shape[0]
    virt_["node_ids"] = self.loader.py.arange(num_nodes, num_nodes + virt_["num"])

    # Virtual edges: (src → virtual), (virtual → dst)
    logger.log(logging.INFO, "Creating virtual edges")
    virt_["edges_1"] = self.loader.py.stack(
        [virt_["src"], virt_["node_ids"]], axis=1
    )
    virt_["edges_2"] = self.loader.py.stack(
        [virt_["node_ids"], virt_["dst"]], axis=1
    )
    virt_["edges"] = self.loader.py.concatenate(
        [virt_["edges_1"], virt_["edges_2"]], axis=0
    )
    virt_["costs"] = self.loader.py.zeros(
        (virt_["edges"].shape[0],), dtype=real_["costs"].dtype
    )

    # Combine real and virtual edges/costs
    logger.log(logging.INFO, "Combining real and virtual edges/costs")
    all_edges = self.loader.py.concatenate([real_["edges"], virt_["edges"]], axis=0)
    all_costs = self.loader.py.concatenate([real_["costs"], virt_["costs"]], axis=0)

    # Final prizes
    logger.log(logging.INFO, "Getting final prizes")
    final_prizes = self.loader.py.concatenate(
        [prizes["nodes"], virt_["prizes"]], axis=0
    )

    # Mapping virtual node ID -> edge index in original graph
    logger.log(logging.INFO, "Creating mapping for virtual nodes")
    mapping_nodes = dict(
        zip(
            self.loader.to_list(virt_["node_ids"]),
            self.loader.to_list(virt_["indices"]),
        )
    )

    # Build return values
    logger.log(logging.INFO, "Building return values")
    edges_dict = {
        "edges": all_edges,
        "num_prior_edges": real_["edges"].shape[0],
    }
    mapping = {
        "edges": mapping_edges,
        "nodes": mapping_nodes,
    }

    return edges_dict, final_prizes, all_costs, mapping
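
The key transformation here is the real/virtual split: edges whose prize stays at or below the cost threshold are kept as ordinary weighted edges, while high-prize edges are rerouted through a newly created "virtual" node that carries the surplus prize, so pcst_fast (which only knows node prizes) can still be rewarded for selecting them. A numpy-only sketch with hypothetical values:

import numpy as np

cost_e, c_const = 0.5, 0.01
edge_index = np.array([[0, 1, 2], [1, 2, 3]])            # 3 edges over 4 nodes
edge_prizes = np.array([0.2, 0.9, 0.4], dtype=np.float32)
num_nodes = 4

threshold = min(cost_e, edge_prizes.max() * (1 - c_const / 2))
real = edge_prizes <= threshold      # kept as ordinary edges with positive cost
virt = ~real                         # rerouted through zero-cost virtual edges

real_edges = np.stack([edge_index[0][real], edge_index[1][real]], axis=1)
real_costs = threshold - edge_prizes[real]

virt_ids = np.arange(num_nodes, num_nodes + virt.sum())  # new virtual node IDs
virt_edges = np.concatenate(
    [np.stack([edge_index[0][virt], virt_ids], axis=1),
     np.stack([virt_ids, edge_index[1][virt]], axis=1)],
    axis=0,
)
virt_prizes = edge_prizes[virt] - threshold              # surplus becomes a node prize

print(real_edges.tolist(), real_costs.tolist())   # [[0, 1], [2, 3]] with costs ~[0.3, 0.1]
print(virt_edges.tolist(), virt_prizes.tolist())  # [[1, 4], [4, 2]] with prize ~[0.4]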

extract_subgraph(text_emb, query_emb, modality, cfg)

Perform the Prize-Collecting Steiner Tree (PCST) algorithm to extract the subgraph.

Parameters:

    text_emb (list, required): The textual description embedding.
    query_emb (list, required): The query embedding. This can be an embedding of a prompt, sequence, or any other feature to be used for the subgraph extraction.
    modality (str, required): The modality to use for the subgraph extraction (e.g., "text", "sequence", "smiles").
    cfg (dict, required): The configuration dictionary containing the Milvus setup.

Returns:

    dict: The selected nodes and edges of the subgraph.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def extract_subgraph(
    self, text_emb: list, query_emb: list, modality: str, cfg: dict
) -> dict:
    """
    Perform the Prize-Collecting Steiner Tree (PCST) algorithm to extract the subgraph.

    Args:
        text_emb: The textual description embedding.
        query_emb: The query embedding. This can be an embedding of
            a prompt, sequence, or any other feature to be used for the subgraph extraction.
        modality: The modality to use for the subgraph extraction
            (e.g., "text", "sequence", "smiles").
        cfg: The configuration dictionary containing the Milvus setup.

    Returns:
        The selected nodes and edges of the subgraph.
    """
    # Load the collections for nodes
    logger.log(logging.INFO, "Preparing collections")
    colls = self.prepare_collections(cfg, modality)

    # Load cache edge index
    logger.log(logging.INFO, "Loading cache edge index")
    with open(cfg.milvus_db.cache_edge_index_path, "rb") as f:
        edge_index = pickle.load(f)
        edge_index = self.loader.py.array(edge_index)

    # Assert the topk and topk_e values for subgraph retrieval
    assert self.topk > 0, "topk must be greater than 0"
    assert self.topk_e > 0, "topk_e must be greater than 0"

    # Retrieve the top-k nodes and edges based on the query embedding
    logger.log(logging.INFO, "compute_prizes")
    prizes = self.compute_prizes(text_emb, query_emb, colls)

    # Compute costs in constructing the subgraph
    logger.log(logging.INFO, "compute_subgraph_costs")
    edges_dict, prizes, costs, mapping = self.compute_subgraph_costs(
        edge_index, colls["nodes"].num_entities, prizes
    )

    # Retrieve the subgraph using the PCST algorithm
    logger.log(logging.INFO, "Running PCST algorithm")
    result_vertices, result_edges = pcst_fast.pcst_fast(
        edges_dict["edges"].tolist(),
        prizes.tolist(),
        costs.tolist(),
        self.root,
        self.num_clusters,
        self.pruning,
        self.verbosity_level,
    )

    # Get subgraph nodes and edges based on the result of the PCST algorithm
    logger.log(logging.INFO, "Getting subgraph nodes and edges")
    subgraph = self.get_subgraph_nodes_edges(
        colls["nodes"].num_entities,
        self.loader.py.asarray(result_vertices),
        {
            "edges": self.loader.py.asarray(result_edges),
            "num_prior_edges": edges_dict["num_prior_edges"],
            "edge_index": edge_index,
        },
        mapping,
    )
    print(subgraph)

    return subgraph

get_subgraph_nodes_edges(num_nodes, vertices, edges_dict, mapping)

Get the selected nodes and edges of the subgraph based on the vertices and edges computed by the PCST algorithm.

Parameters:

    num_nodes (int, required): The number of nodes in the graph.
    vertices (required): The vertices selected by the PCST algorithm.
    edges_dict (dict, required): A dictionary containing the edges and the number of prior edges.
    mapping (dict, required): A dictionary containing the mapping of nodes and edges.

Returns:

    dict: The selected nodes and edges of the extracted subgraph.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def get_subgraph_nodes_edges(
    self, num_nodes: int, vertices, edges_dict: dict, mapping: dict
) -> dict:
    """
    Get the selected nodes and edges of the subgraph based on the vertices and edges computed
    by the PCST algorithm.

    Args:
        num_nodes: The number of nodes in the graph.
        vertices: The vertices selected by the PCST algorithm.
        edges_dict: A dictionary containing the edges and the number of prior edges.
        mapping: A dictionary containing the mapping of nodes and edges.

    Returns:
        The selected nodes and edges of the extracted subgraph.
    """
    # Get edges information
    edges = edges_dict["edges"]
    num_prior_edges = edges_dict["num_prior_edges"]

    # Retrieve the selected nodes and edges based on the given vertices and edges
    subgraph_nodes = vertices[vertices < num_nodes]
    subgraph_edges = [
        mapping["edges"][e.item()] for e in edges if e < num_prior_edges
    ]
    virtual_vertices = vertices[vertices >= num_nodes]
    if len(virtual_vertices) > 0:
        virtual_edges = [mapping["nodes"][i.item()] for i in virtual_vertices]
        subgraph_edges = self.loader.py.array(subgraph_edges + virtual_edges)
    edge_index = edges_dict["edge_index"][:, subgraph_edges]
    subgraph_nodes = self.loader.py.unique(
        self.loader.py.concatenate([subgraph_nodes, edge_index[0], edge_index[1]])
    )

    return {"nodes": subgraph_nodes, "edges": subgraph_edges}

prepare_collections(cfg, modality)

Prepare the collections for nodes, node-type specific nodes, and edges in Milvus.

Parameters:

    cfg (dict, required): The configuration dictionary containing the Milvus setup.
    modality (str, required): The modality to use for the subgraph extraction.

Returns:

    dict: A dictionary containing the collections of nodes, node-type specific nodes, and edges.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def prepare_collections(self, cfg: dict, modality: str) -> dict:
    """
    Prepare the collections for nodes, node-type specific nodes, and edges in Milvus.

    Args:
        cfg: The configuration dictionary containing the Milvus setup.
        modality: The modality to use for the subgraph extraction.

    Returns:
        A dictionary containing the collections of nodes, node-type specific nodes, and edges.
    """
    # Initialize the collections dictionary
    colls = {}

    # Load the collection for nodes
    colls["nodes"] = Collection(name=f"{cfg.milvus_db.database_name}_nodes")

    if modality != "prompt":
        # Load the collection for the specific node type
        colls["nodes_type"] = Collection(
            f"{cfg.milvus_db.database_name}_nodes_{modality.replace('/', '_')}"
        )

    # Load the collection for edges
    colls["edges"] = Collection(name=f"{cfg.milvus_db.database_name}_edges")

    # Load the collections
    for coll in colls.values():
        coll.load()

    return colls
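
The collection names this method looks up are derived from the configured database name and the requested modality; a modality of "prompt" skips the node-type specific collection. A small sketch of the naming convention, using a hypothetical database name:

database_name = "t2kg"   # hypothetical cfg.milvus_db.database_name
modality = "sequence"

collection_names = {
    "nodes": f"{database_name}_nodes",
    "edges": f"{database_name}_edges",
}
if modality != "prompt":
    collection_names["nodes_type"] = f"{database_name}_nodes_{modality.replace('/', '_')}"
print(collection_names)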

SystemDetector

Detect system capabilities and choose appropriate libraries.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
class SystemDetector:
    """Detect system capabilities and choose appropriate libraries."""

    def __init__(self):
        self.os_type = platform.system().lower()  # 'windows', 'linux', 'darwin'
        self.architecture = platform.machine().lower()  # 'x86_64', 'arm64', etc.
        self.has_nvidia_gpu = self._detect_nvidia_gpu()
        self.use_gpu = (
            self.has_nvidia_gpu and self.os_type != "darwin"
        )  # No CUDA on macOS

        logger.info("System Detection Results:")
        logger.info("  OS: %s", self.os_type)
        logger.info("  Architecture: %s", self.architecture)
        logger.info("  NVIDIA GPU detected: %s", self.has_nvidia_gpu)
        logger.info("  Will use GPU acceleration: %s", self.use_gpu)

    def _detect_nvidia_gpu(self) -> bool:
        """Detect if NVIDIA GPU is available."""
        try:
            # Try nvidia-smi command
            result = subprocess.run(
                ["nvidia-smi"], capture_output=True, text=True, timeout=10, check=False
            )
            return result.returncode == 0
        except (
            subprocess.TimeoutExpired,
            FileNotFoundError,
            subprocess.SubprocessError,
        ):
            return False

    def get_system_info(self) -> dict:
        """Get comprehensive system information."""
        return {
            "os_type": self.os_type,
            "architecture": self.architecture,
            "has_nvidia_gpu": self.has_nvidia_gpu,
            "use_gpu": self.use_gpu,
        }

    def is_gpu_compatible(self) -> bool:
        """Check if the system is compatible with GPU acceleration."""
        return self.has_nvidia_gpu and self.os_type != "darwin"

_detect_nvidia_gpu()

Detect if NVIDIA GPU is available.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def _detect_nvidia_gpu(self) -> bool:
    """Detect if NVIDIA GPU is available."""
    try:
        # Try nvidia-smi command
        result = subprocess.run(
            ["nvidia-smi"], capture_output=True, text=True, timeout=10, check=False
        )
        return result.returncode == 0
    except (
        subprocess.TimeoutExpired,
        FileNotFoundError,
        subprocess.SubprocessError,
    ):
        return False

get_system_info()

Get comprehensive system information.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def get_system_info(self) -> dict:
    """Get comprehensive system information."""
    return {
        "os_type": self.os_type,
        "architecture": self.architecture,
        "has_nvidia_gpu": self.has_nvidia_gpu,
        "use_gpu": self.use_gpu,
    }

is_gpu_compatible()

Check if the system is compatible with GPU acceleration.

Source code in aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py
def is_gpu_compatible(self) -> bool:
    """Check if the system is compatible with GPU acceleration."""
    return self.has_nvidia_gpu and self.os_type != "darwin"