Skip to content

Download arXiv Input

Tool for downloading arXiv paper metadata and retrieving the PDF URL.

DownloadArxivPaperInput

Bases: BaseModel

Input schema for the arXiv paper download tool.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py
23
24
25
26
27
28
29
class DownloadArxivPaperInput(BaseModel):
    """Input schema for the arXiv paper download tool.

    Pydantic model used as ``args_schema`` by the ``download_arxiv_paper``
    tool; inputs are validated against it before the tool body runs.
    """

    # One or more arXiv identifiers whose metadata and PDF URLs should be fetched.
    arxiv_ids: List[str] = Field(
        description="List of arXiv paper IDs used to retrieve paper details and PDF URLs."
    )
    # Injected by the tool framework (see InjectedToolCallId); echoed back in
    # the resulting ToolMessage so the caller can correlate request and result.
    tool_call_id: Annotated[str, InjectedToolCallId]

_get_arxiv_config()

Load arXiv download configuration.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py
33
34
35
36
37
38
39
def _get_arxiv_config() -> Any:
    """Compose and return the Hydra config node for the arXiv download tool."""
    overrides = ["tools/download_arxiv_paper=default"]
    # Initialize Hydra relative to this package, compose the top-level config,
    # then hand back only the subtree this tool cares about.
    with hydra.initialize(version_base=None, config_path="../../configs"):
        composed = hydra.compose(config_name="config", overrides=overrides)
    return composed.tools.download_arxiv_paper

_get_snippet(abstract)

Extract the first one or two sentences from an abstract.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py
 95
 96
 97
 98
 99
100
101
102
103
104
def _get_snippet(abstract: str) -> str:
    """Extract the first one or two sentences from an abstract."""
    if not abstract or abstract == "N/A":
        return ""
    sentences = abstract.split(". ")
    snippet_sentences = sentences[:2]
    snippet = ". ".join(snippet_sentences)
    if not snippet.endswith("."):
        snippet += "."
    return snippet

download_arxiv_paper(arxiv_ids, tool_call_id)

Get metadata and PDF URLs for one or more arXiv papers using their unique arXiv IDs.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
@tool(
    args_schema=DownloadArxivPaperInput,
    parse_docstring=True,
)
def download_arxiv_paper(
    arxiv_ids: List[str],
    tool_call_id: Annotated[str, InjectedToolCallId],
) -> Command[Any]:
    """
    Get metadata and PDF URLs for one or more arXiv papers using their unique arXiv IDs.
    """
    logger.info("Fetching metadata from arXiv for paper IDs: %s", arxiv_ids)

    # Resolve endpoint and timeout from the Hydra config.
    cfg = _get_arxiv_config()
    ns = {"atom": "http://www.w3.org/2005/Atom"}

    # Collect per-paper metadata keyed by arXiv ID; IDs with no Atom
    # entry are skipped with a warning rather than failing the batch.
    article_data: dict[str, Any] = {}
    for arxiv_id in arxiv_ids:
        logger.info("Processing arXiv ID: %s", arxiv_id)
        feed = fetch_arxiv_metadata(cfg.api_url, arxiv_id, cfg.request_timeout)
        entry = feed.find("atom:entry", ns)
        if entry is None:
            logger.warning("No entry found for arXiv ID %s", arxiv_id)
            continue
        article_data[arxiv_id] = extract_metadata(entry, ns, arxiv_id)

    # Return a state update carrying the aggregated metadata plus a
    # human-readable summary message tied to this tool call.
    return Command(
        update={
            "article_data": article_data,
            "messages": [
                ToolMessage(
                    content=_build_summary(article_data),
                    tool_call_id=tool_call_id,
                    artifact=article_data,
                )
            ],
        }
    )

extract_metadata(entry, ns, arxiv_id)

Extract metadata from the XML entry.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def extract_metadata(entry: ET.Element, ns: dict, arxiv_id: str) -> dict:
    """Extract metadata from the XML entry."""

    def _field_text(tag: str) -> str:
        # Stripped text of a child element, or "N/A" when it is absent.
        node = entry.find(tag, ns)
        if node is None:
            return "N/A"
        return (node.text or "").strip()

    # Author names, skipping authors whose <name> element is missing or empty.
    authors = [
        name_node.text.strip()
        for author_node in entry.findall("atom:author", ns)
        if (name_node := author_node.find("atom:name", ns)) is not None
        and name_node.text
    ]

    # First <link> advertised with title="pdf" carries the PDF location.
    pdf_url = None
    for link in entry.findall("atom:link", ns):
        if link.attrib.get("title") == "pdf":
            pdf_url = link.attrib.get("href")
            break
    if not pdf_url:
        raise RuntimeError(f"Could not find PDF URL for arXiv ID {arxiv_id}")

    return {
        "Title": _field_text("atom:title"),
        "Authors": authors,
        "Abstract": _field_text("atom:summary"),
        "Publication Date": _field_text("atom:published"),
        "URL": pdf_url,
        "pdf_url": pdf_url,
        "filename": f"{arxiv_id}.pdf",
        "source": "arxiv",
        "arxiv_id": arxiv_id,
    }

fetch_arxiv_metadata(api_url, arxiv_id, request_timeout)

Fetch and parse metadata from the arXiv API.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py
42
43
44
45
46
47
48
49
def fetch_arxiv_metadata(
    api_url: str, arxiv_id: str, request_timeout: int
) -> ET.Element:
    """Fetch and parse metadata from the arXiv API."""
    # Ask the Atom endpoint for exactly one result matching this ID.
    query = f"search_query=id:{arxiv_id}&start=0&max_results=1"
    response = requests.get(f"{api_url}?{query}", timeout=request_timeout)
    response.raise_for_status()
    # The feed root is returned; callers locate the <entry> themselves.
    return ET.fromstring(response.text)