Skip to content

Tool Helper

Helper class for PDF Q&A tool orchestration: state validation, vectorstore init, paper loading, reranking, and answer formatting.

QAToolHelper

Encapsulates helper routines for the PDF Question & Answer tool.

Source code in aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
class QAToolHelper:
    """Encapsulates helper routines for the PDF Question & Answer tool."""

    def __init__(self) -> None:
        """Create a helper with no cached store; config/call id arrive via start_call()."""
        self.prebuilt_vector_store: Optional[Vectorstore] = None
        self.config: Any = None
        self.call_id: str = ""
        logger.debug("Initialized QAToolHelper")

    def start_call(self, config: Any, call_id: str) -> None:
        """Initialize helper with current config and call identifier."""
        self.config = config
        self.call_id = call_id
        logger.debug("QAToolHelper started call %s", call_id)

    def get_state_models_and_data(self, state: dict) -> tuple[Any, Any, Dict[str, Any]]:
        """Retrieve embedding model, LLM, and article data from agent state.

        Raises:
            ValueError: if any required entry is missing or empty.
        """
        # (state key, error message) pairs, checked in this fixed order.
        requirements = (
            ("text_embedding_model", "No text embedding model found in state."),
            ("llm_model", "No LLM model found in state."),
            ("article_data", "No article_data found in state."),
        )
        found = []
        for key, missing_msg in requirements:
            value = state.get(key)
            if not value:
                logger.error("%s: %s", self.call_id, missing_msg)
                raise ValueError(missing_msg)
            found.append(value)
        return found[0], found[1], found[2]

    def init_vector_store(self, emb_model: Any) -> Vectorstore:
        """Return shared or new Vectorstore instance."""
        cached = self.prebuilt_vector_store
        if cached is not None:
            logger.info("Using shared pre-built vector store from memory")
            return cached
        fresh = Vectorstore(embedding_model=emb_model, config=self.config)
        logger.info("Initialized new vector store with provided configuration")
        self.prebuilt_vector_store = fresh
        return fresh

    def load_candidate_papers(
        self,
        vs: Vectorstore,
        articles: Dict[str, Any],
        candidates: List[str],
    ) -> None:
        """Ensure each candidate paper is loaded into the vector store."""
        for paper_id in candidates:
            if paper_id in vs.loaded_papers:
                continue  # already indexed in this store
            pdf_url = articles.get(paper_id, {}).get("pdf_url")
            if not pdf_url:
                continue  # no PDF to fetch for this paper
            try:
                vs.add_paper(paper_id, pdf_url, articles[paper_id])
            except (IOError, ValueError) as exc:
                # Best effort: log and keep loading the remaining candidates.
                logger.warning(
                    "%s: Error loading paper %s: %s", self.call_id, paper_id, exc
                )

    def run_reranker(
        self,
        vs: Vectorstore,
        query: str,
        candidates: List[str],
    ) -> List[str]:
        """Rank papers by relevance and return filtered paper IDs.

        Falls back to the unfiltered candidate list when the reranker fails.
        """
        try:
            ranked = rank_papers_by_query(
                vs, query, self.config, top_k=self.config.top_k_papers
            )
            logger.info("%s: Papers after NVIDIA reranking: %s", self.call_id, ranked)
            return [pid for pid in ranked if pid in candidates]
        except (ValueError, RuntimeError) as err:
            logger.error("%s: NVIDIA reranker failed: %s", self.call_id, err)
            logger.info(
                "%s: Falling back to all %d candidate papers",
                self.call_id,
                len(candidates),
            )
            return candidates

    def format_answer(
        self,
        question: str,
        chunks: List[Any],
        llm: Any,
        articles: Dict[str, Any],
    ) -> str:
        """Generate the final answer text with source attributions."""
        result = generate_answer(question, chunks, llm, self.config)
        answer = result.get("output_text", "No answer generated.")
        # paper id -> title, de-duplicated while preserving first-seen order
        titles: Dict[str, str] = {
            pid: articles[pid].get("Title", "Unknown paper")
            for pid in result.get("papers_used", [])
            if pid in articles
        }
        srcs = (
            "\n\nSources:\n" + "\n".join(f"- {t}" for t in titles.values())
            if titles
            else ""
        )
        logger.info(
            "%s: Generated answer using %d chunks from %d papers",
            self.call_id,
            len(chunks),
            len(titles),
        )
        return f"{answer}{srcs}"

format_answer(question, chunks, llm, articles)

Generate the final answer text with source attributions.

Source code in aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def format_answer(
    self,
    question: str,
    chunks: List[Any],
    llm: Any,
    articles: Dict[str, Any],
) -> str:
    """Generate the final answer text with source attributions."""
    result = generate_answer(question, chunks, llm, self.config)
    answer = result.get("output_text", "No answer generated.")
    # Collect titles for the papers the answer actually used (order preserved).
    titles: Dict[str, str] = {}
    for pid in result.get("papers_used", []):
        if pid in articles:
            titles[pid] = articles[pid].get("Title", "Unknown paper")
    srcs = ""
    if titles:
        bullet_lines = "\n".join(f"- {title}" for title in titles.values())
        srcs = "\n\nSources:\n" + bullet_lines
    logger.info(
        "%s: Generated answer using %d chunks from %d papers",
        self.call_id,
        len(chunks),
        len(titles),
    )
    return f"{answer}{srcs}"

get_state_models_and_data(state)

Retrieve embedding model, LLM, and article data from agent state.

Source code in aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def get_state_models_and_data(self, state: dict) -> tuple[Any, Any, Dict[str, Any]]:
    """Retrieve embedding model, LLM, and article data from agent state.

    Raises:
        ValueError: if any required entry is missing or empty.
    """

    def _require(key: str, msg: str) -> Any:
        # Fail fast with a logged error when a required entry is absent/empty.
        value = state.get(key)
        if not value:
            logger.error("%s: %s", self.call_id, msg)
            raise ValueError(msg)
        return value

    # Tuple elements are evaluated left to right, matching the original
    # check order: embedding model, then LLM, then article data.
    return (
        _require("text_embedding_model", "No text embedding model found in state."),
        _require("llm_model", "No LLM model found in state."),
        _require("article_data", "No article_data found in state."),
    )

init_vector_store(emb_model)

Return shared or new Vectorstore instance.

Source code in aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py
50
51
52
53
54
55
56
57
58
def init_vector_store(self, emb_model: Any) -> Vectorstore:
    """Return shared or new Vectorstore instance."""
    if self.prebuilt_vector_store is None:
        # First call: build a store and cache it for subsequent calls.
        store = Vectorstore(embedding_model=emb_model, config=self.config)
        logger.info("Initialized new vector store with provided configuration")
        self.prebuilt_vector_store = store
        return store
    logger.info("Using shared pre-built vector store from memory")
    return self.prebuilt_vector_store

load_candidate_papers(vs, articles, candidates)

Ensure each candidate paper is loaded into the vector store.

Source code in aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def load_candidate_papers(
    self,
    vs: Vectorstore,
    articles: Dict[str, Any],
    candidates: List[str],
) -> None:
    """Ensure each candidate paper is loaded into the vector store."""
    # Lazily skip papers that are already indexed at the time they come up.
    pending = (pid for pid in candidates if pid not in vs.loaded_papers)
    for pid in pending:
        url = articles.get(pid, {}).get("pdf_url")
        if not url:
            continue  # skip entries without a downloadable PDF
        try:
            vs.add_paper(pid, url, articles[pid])
        except (IOError, ValueError) as exc:
            # Best effort: log the failure and continue with the rest.
            logger.warning(
                "%s: Error loading paper %s: %s", self.call_id, pid, exc
            )

run_reranker(vs, query, candidates)

Rank papers by relevance and return filtered paper IDs.

Source code in aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def run_reranker(
    self,
    vs: Vectorstore,
    query: str,
    candidates: List[str],
) -> List[str]:
    """Rank papers by relevance and return filtered paper IDs.

    Falls back to the unfiltered candidate list when the reranker fails.
    """
    try:
        ranked = rank_papers_by_query(
            vs, query, self.config, top_k=self.config.top_k_papers
        )
        logger.info("%s: Papers after NVIDIA reranking: %s", self.call_id, ranked)
        allowed = set(candidates)  # O(1) membership for the filter below
        return [pid for pid in ranked if pid in allowed]
    except (ValueError, RuntimeError) as exc:
        logger.error("%s: NVIDIA reranker failed: %s", self.call_id, exc)
        logger.info(
            "%s: Falling back to all %d candidate papers",
            self.call_id,
            len(candidates),
        )
        return candidates