Skip to content

Arxiv Downloader

Arxiv Paper Downloader

This module provides an implementation of AbstractPaperDownloader for arXiv. It connects to the arXiv API, retrieves metadata for a research paper, and downloads the corresponding PDF.

Because it is built on an abstract base class, the same interface can be extended to other sources such as PubMed, IEEE Xplore, etc.

ArxivPaperDownloader

Bases: AbstractPaperDownloader

Downloader class for arXiv.

This class interfaces with the arXiv API to fetch metadata and retrieve PDFs of academic papers based on their arXiv IDs.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
class ArxivPaperDownloader(AbstractPaperDownloader):
    """
    Downloader class for arXiv.

    This class interfaces with the arXiv API to fetch metadata
    and retrieve PDFs of academic papers based on their arXiv IDs.
    """

    def __init__(self):
        """
        Initializes the arXiv paper downloader.

        Uses Hydra for configuration management to retrieve API details.
        The values are read from the ``tools.download_arxiv_paper`` node of
        the composed configuration and stored on the instance.
        """
        with hydra.initialize(version_base=None, config_path="../../configs"):
            cfg = hydra.compose(
                config_name="config", overrides=["tools/download_arxiv_paper=default"]
            )
            self.api_url = cfg.tools.download_arxiv_paper.api_url
            self.request_timeout = cfg.tools.download_arxiv_paper.request_timeout
            self.chunk_size = cfg.tools.download_arxiv_paper.chunk_size
            self.pdf_base_url = cfg.tools.download_arxiv_paper.pdf_base_url

    def fetch_metadata(self, paper_id: str) -> Dict[str, Any]:
        """
        Fetch metadata from arXiv for a given paper ID.

        Args:
            paper_id (str): The arXiv ID of the paper.

        Returns:
            Dict[str, Any]: A dictionary with a single key, ``"xml"``, holding
            the raw text of the API response.

        Raises:
            Whatever ``requests.get`` raises on failure, or the error raised
            by ``raise_for_status()`` for a non-success HTTP status.
        """
        logger.info("Fetching metadata from arXiv for paper ID: %s", paper_id)
        api_url = f"{self.api_url}?search_query=id:{paper_id}&start=0&max_results=1"
        response = requests.get(api_url, timeout=self.request_timeout)
        response.raise_for_status()
        return {"xml": response.text}

    def download_pdf(self, paper_id: str) -> Dict[str, Any]:
        """
        Download the PDF of a paper from arXiv.

        This function first retrieves the paper's metadata to locate the PDF link
        before downloading the file.

        Args:
            paper_id (str): The arXiv ID of the paper.

        Returns:
            Dict[str, Any]: A dictionary containing:
                - `pdf_object`: The binary content of the downloaded PDF.
                - `pdf_url`: The URL from which the PDF was fetched.
                - `arxiv_id`: The arXiv ID of the downloaded paper.

        Raises:
            RuntimeError: If the metadata contains no link titled ``"pdf"``.
        """
        metadata = self.fetch_metadata(paper_id)

        # Parse the XML response to locate the PDF link.
        root = ET.fromstring(metadata["xml"])
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        pdf_url = next(
            (
                link.attrib.get("href")
                for entry in root.findall("atom:entry", ns)
                for link in entry.findall("atom:link", ns)
                if link.attrib.get("title") == "pdf"
            ),
            None,
        )

        if not pdf_url:
            raise RuntimeError(f"Failed to download PDF for arXiv ID {paper_id}.")

        logger.info("Downloading PDF from: %s", pdf_url)
        pdf_response = requests.get(pdf_url, stream=True, timeout=self.request_timeout)
        try:
            pdf_response.raise_for_status()

            # Combine the PDF data from chunks, skipping empty ones.
            pdf_object = b"".join(
                chunk
                for chunk in pdf_response.iter_content(chunk_size=self.chunk_size)
                if chunk
            )
        finally:
            # The request was made with stream=True, so close the response
            # explicitly; this also covers the paths where the status check
            # or the body read raises.
            pdf_response.close()

        return {
            "pdf_object": pdf_object,
            "pdf_url": pdf_url,
            "arxiv_id": paper_id,
        }

__init__()

Initializes the arXiv paper downloader.

Uses Hydra for configuration management to retrieve API details.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __init__(self):
    """
    Set up the arXiv paper downloader from the Hydra configuration.

    Composes the ``config`` configuration with the
    ``tools/download_arxiv_paper=default`` override and copies the API
    settings found under ``tools.download_arxiv_paper`` onto the instance.
    """
    with hydra.initialize(version_base=None, config_path="../../configs"):
        composed = hydra.compose(
            config_name="config",
            overrides=["tools/download_arxiv_paper=default"],
        )
        # All downloader settings live under one config node.
        arxiv_settings = composed.tools.download_arxiv_paper
        self.api_url = arxiv_settings.api_url
        self.request_timeout = arxiv_settings.request_timeout
        self.chunk_size = arxiv_settings.chunk_size
        self.pdf_base_url = arxiv_settings.pdf_base_url

download_pdf(paper_id)

Download the PDF of a paper from arXiv.

This function first retrieves the paper's metadata to locate the PDF link before downloading the file.

Parameters:

Name Type Description Default
paper_id str

The arXiv ID of the paper.

required

Returns:

Type Description
Dict[str, Any]

Dict[str, Any]: A dictionary containing:

- pdf_object: The binary content of the downloaded PDF.
- pdf_url: The URL from which the PDF was fetched.
- arxiv_id: The arXiv ID of the downloaded paper.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def download_pdf(self, paper_id: str) -> Dict[str, Any]:
    """
    Download the PDF of a paper from arXiv.

    This function first retrieves the paper's metadata to locate the PDF link
    before downloading the file.

    Args:
        paper_id (str): The arXiv ID of the paper.

    Returns:
        Dict[str, Any]: A dictionary containing:
            - `pdf_object`: The binary content of the downloaded PDF.
            - `pdf_url`: The URL from which the PDF was fetched.
            - `arxiv_id`: The arXiv ID of the downloaded paper.

    Raises:
        RuntimeError: If the metadata contains no link titled ``"pdf"``.
    """
    metadata = self.fetch_metadata(paper_id)

    # Parse the XML response to locate the PDF link.
    root = ET.fromstring(metadata["xml"])
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    pdf_url = next(
        (
            link.attrib.get("href")
            for entry in root.findall("atom:entry", ns)
            for link in entry.findall("atom:link", ns)
            if link.attrib.get("title") == "pdf"
        ),
        None,
    )

    if not pdf_url:
        raise RuntimeError(f"Failed to download PDF for arXiv ID {paper_id}.")

    logger.info("Downloading PDF from: %s", pdf_url)
    pdf_response = requests.get(pdf_url, stream=True, timeout=self.request_timeout)
    try:
        pdf_response.raise_for_status()

        # Combine the PDF data from chunks, skipping empty ones.
        pdf_object = b"".join(
            chunk
            for chunk in pdf_response.iter_content(chunk_size=self.chunk_size)
            if chunk
        )
    finally:
        # The request was made with stream=True, so close the response
        # explicitly; this also covers the paths where the status check
        # or the body read raises.
        pdf_response.close()

    return {
        "pdf_object": pdf_object,
        "pdf_url": pdf_url,
        "arxiv_id": paper_id,
    }

fetch_metadata(paper_id)

Fetch metadata from arXiv for a given paper ID.

Parameters:

Name Type Description Default
paper_id str

The arXiv ID of the paper.

required

Returns:

Type Description
Dict[str, Any]

Dict[str, Any]: A dictionary containing metadata, including the XML response.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def fetch_metadata(self, paper_id: str) -> Dict[str, Any]:
    """
    Query the arXiv API for the metadata of a single paper.

    Args:
        paper_id (str): The arXiv ID of the paper.

    Returns:
        Dict[str, Any]: A dictionary whose ``"xml"`` key holds the raw text
        of the API response.
    """
    logger.info("Fetching metadata from arXiv for paper ID: %s", paper_id)
    # Ask for at most one result matching the given ID.
    query_string = f"?search_query=id:{paper_id}&start=0&max_results=1"
    reply = requests.get(
        f"{self.api_url}{query_string}",
        timeout=self.request_timeout,
    )
    reply.raise_for_status()
    xml_text = reply.text
    return {"xml": xml_text}