Arxiv Downloader

Arxiv Paper Downloader

This module provides an implementation of AbstractPaperDownloader for arXiv. It connects to the arXiv API, retrieves metadata for a research paper, and downloads the corresponding PDF.

Because it builds on an abstract base class, the same design can be extended to other sources such as PubMed, IEEE Xplore, etc.

`ArxivPaperDownloader`

Bases: AbstractPaperDownloader

Downloader class for arXiv.

This class interfaces with the arXiv API to fetch metadata and retrieve PDFs of academic papers based on their arXiv IDs.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py

class ArxivPaperDownloader(AbstractPaperDownloader):
    """
    Downloader class for arXiv.

    This class interfaces with the arXiv API to fetch metadata
    and retrieve PDFs of academic papers based on their arXiv IDs.

    Attributes (all read from the ``tools.download_arxiv_paper`` config node):
        api_url: Base URL the metadata query string is appended to.
        request_timeout: Timeout passed to every ``requests.get`` call.
        chunk_size: Chunk size used when reading the streamed PDF body.
        pdf_base_url: Stored from the config; not used by the methods below.
    """

    def __init__(self):
        """
        Initializes the arXiv paper downloader.

        Uses Hydra for configuration management to retrieve API details.
        """
        with hydra.initialize(version_base=None, config_path="../../configs"):
            cfg = hydra.compose(
                config_name="config", overrides=["tools/download_arxiv_paper=default"]
            )
            self.api_url = cfg.tools.download_arxiv_paper.api_url
            self.request_timeout = cfg.tools.download_arxiv_paper.request_timeout
            self.chunk_size = cfg.tools.download_arxiv_paper.chunk_size
            self.pdf_base_url = cfg.tools.download_arxiv_paper.pdf_base_url

    def fetch_metadata(self, paper_id: str) -> Dict[str, Any]:
        """
        Fetch metadata from arXiv for a given paper ID.

        Args:
            paper_id (str): The arXiv ID of the paper.

        Returns:
            Dict[str, Any]: A dictionary whose ``xml`` key holds the raw text
            of the API response.

        Raises:
            requests.HTTPError: If the API answers with an error status
                (raised by ``raise_for_status``).
        """
        logger.info("Fetching metadata from arXiv for paper ID: %s", paper_id)
        api_url = f"{self.api_url}?search_query=id:{paper_id}&start=0&max_results=1"
        response = requests.get(api_url, timeout=self.request_timeout)
        response.raise_for_status()
        return {"xml": response.text}

    def download_pdf(self, paper_id: str) -> Dict[str, Any]:
        """
        Download the PDF of a paper from arXiv.

        This function first retrieves the paper's metadata to locate the PDF link
        before downloading the file.

        Args:
            paper_id (str): The arXiv ID of the paper.

        Returns:
            Dict[str, Any]: A dictionary containing:
                - `pdf_object`: The binary content of the downloaded PDF.
                - `pdf_url`: The URL from which the PDF was fetched.
                - `arxiv_id`: The arXiv ID of the downloaded paper.

        Raises:
            RuntimeError: If the metadata contains no link titled ``pdf``.
            requests.HTTPError: If the metadata or PDF request returns an
                error status.
        """
        metadata = self.fetch_metadata(paper_id)

        # Parse the Atom feed and take the first <link title="pdf"> found
        # in any entry; None when the feed has no such link.
        root = ET.fromstring(metadata["xml"])
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        pdf_url = next(
            (
                link.attrib.get("href")
                for entry in root.findall("atom:entry", ns)
                for link in entry.findall("atom:link", ns)
                if link.attrib.get("title") == "pdf"
            ),
            None,
        )

        if not pdf_url:
            raise RuntimeError(f"Failed to download PDF for arXiv ID {paper_id}.")

        logger.info("Downloading PDF from: %s", pdf_url)
        pdf_response = requests.get(pdf_url, stream=True, timeout=self.request_timeout)
        try:
            pdf_response.raise_for_status()

            # Combine the PDF data from chunks, skipping empty keep-alive chunks.
            pdf_object = b"".join(
                chunk
                for chunk in pdf_response.iter_content(chunk_size=self.chunk_size)
                if chunk
            )
        finally:
            # With stream=True the connection stays checked out until the body
            # is fully read or the response is closed; close it explicitly so
            # an error status or a failed read does not leak the connection.
            pdf_response.close()

        return {
            "pdf_object": pdf_object,
            "pdf_url": pdf_url,
            "arxiv_id": paper_id,
        }

`__init__()`

Initializes the arXiv paper downloader.

Uses Hydra for configuration management to retrieve API details.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py

def __init__(self):
    """
    Set up the arXiv downloader from the Hydra configuration.

    Composes the ``config`` configuration with the
    ``tools/download_arxiv_paper=default`` override and copies the API URL,
    request timeout, chunk size and PDF base URL onto the instance.
    """
    with hydra.initialize(version_base=None, config_path="../../configs"):
        composed = hydra.compose(
            config_name="config",
            overrides=["tools/download_arxiv_paper=default"],
        )
        # All settings for this tool live under one config node.
        arxiv_cfg = composed.tools.download_arxiv_paper
        self.api_url = arxiv_cfg.api_url
        self.request_timeout = arxiv_cfg.request_timeout
        self.chunk_size = arxiv_cfg.chunk_size
        self.pdf_base_url = arxiv_cfg.pdf_base_url

`download_pdf(paper_id)`

Download the PDF of a paper from arXiv.

This function first retrieves the paper's metadata to locate the PDF link before downloading the file.

Parameters:

Name	Type	Description	Default
`paper_id`	`str`	The arXiv ID of the paper.	required

Returns:

Type	Description
`Dict[str, Any]`	A dictionary containing: `pdf_object` (the binary content of the downloaded PDF), `pdf_url` (the URL from which the PDF was fetched), and `arxiv_id` (the arXiv ID of the downloaded paper).

Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py

def download_pdf(self, paper_id: str) -> Dict[str, Any]:
    """
    Download the PDF of a paper from arXiv.

    This function first retrieves the paper's metadata to locate the PDF link
    before downloading the file.

    Args:
        paper_id (str): The arXiv ID of the paper.

    Returns:
        Dict[str, Any]: A dictionary containing:
            - `pdf_object`: The binary content of the downloaded PDF.
            - `pdf_url`: The URL from which the PDF was fetched.
            - `arxiv_id`: The arXiv ID of the downloaded paper.

    Raises:
        RuntimeError: If the metadata contains no link titled ``pdf``.
        requests.HTTPError: If the PDF request returns an error status.
    """
    metadata = self.fetch_metadata(paper_id)

    # Parse the Atom feed and take the first <link title="pdf"> found
    # in any entry; None when the feed has no such link.
    root = ET.fromstring(metadata["xml"])
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    pdf_url = next(
        (
            link.attrib.get("href")
            for entry in root.findall("atom:entry", ns)
            for link in entry.findall("atom:link", ns)
            if link.attrib.get("title") == "pdf"
        ),
        None,
    )

    if not pdf_url:
        raise RuntimeError(f"Failed to download PDF for arXiv ID {paper_id}.")

    logger.info("Downloading PDF from: %s", pdf_url)
    pdf_response = requests.get(pdf_url, stream=True, timeout=self.request_timeout)
    try:
        pdf_response.raise_for_status()

        # Combine the PDF data from chunks, skipping empty keep-alive chunks.
        pdf_object = b"".join(
            chunk
            for chunk in pdf_response.iter_content(chunk_size=self.chunk_size)
            if chunk
        )
    finally:
        # With stream=True the connection stays checked out until the body
        # is fully read or the response is closed; close it explicitly so
        # an error status or a failed read does not leak the connection.
        pdf_response.close()

    return {
        "pdf_object": pdf_object,
        "pdf_url": pdf_url,
        "arxiv_id": paper_id,
    }

`fetch_metadata(paper_id)`

Fetch metadata from arXiv for a given paper ID.

Parameters:

Name	Type	Description	Default
`paper_id`	`str`	The arXiv ID of the paper.	required

Returns:

Type	Description
`Dict[str, Any]`	A dictionary whose `xml` key holds the raw XML text of the arXiv API response.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py

def fetch_metadata(self, paper_id: str) -> Dict[str, Any]:
    """
    Query the arXiv API for a single paper and return the raw response.

    Args:
        paper_id (str): The arXiv ID of the paper.

    Returns:
        Dict[str, Any]: A dictionary whose ``xml`` key holds the response text.
    """
    logger.info("Fetching metadata from arXiv for paper ID: %s", paper_id)

    # Search by ID and ask for at most one result.
    query_url = f"{self.api_url}?search_query=id:{paper_id}&start=0&max_results=1"
    reply = requests.get(query_url, timeout=self.request_timeout)
    reply.raise_for_status()

    payload: Dict[str, Any] = {"xml": reply.text}
    return payload

Arxiv Downloader

ArxivPaperDownloader

__init__()

download_pdf(paper_id)

fetch_metadata(paper_id)

`ArxivPaperDownloader`

`__init__()`

`download_pdf(paper_id)`

`fetch_metadata(paper_id)`