arXiv Paper Downloader
This module provides an implementation of AbstractPaperDownloader
for arXiv.
It connects to the arXiv API, retrieves metadata for a research paper, and
downloads the corresponding PDF.
By using an abstract base class, this implementation is extendable to other
APIs like PubMed, IEEE Xplore, etc.
Bases: AbstractPaperDownloader
Downloader class for arXiv.
This class interfaces with the arXiv API to fetch metadata
and retrieve PDFs of academic papers based on their arXiv IDs.
Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
class ArxivPaperDownloader(AbstractPaperDownloader):
    """
    Downloader class for arXiv.

    This class interfaces with the arXiv API to fetch metadata
    and retrieve PDFs of academic papers based on their arXiv IDs.
    """

    def __init__(self):
        """
        Initialize the arXiv paper downloader.

        Uses Hydra for configuration management: loads the API base URL,
        request timeout, download chunk size, and PDF base URL from the
        ``tools/download_arxiv_paper`` config group.
        """
        with hydra.initialize(version_base=None, config_path="../../configs"):
            cfg = hydra.compose(
                config_name="config", overrides=["tools/download_arxiv_paper=default"]
            )
        self.api_url = cfg.tools.download_arxiv_paper.api_url
        self.request_timeout = cfg.tools.download_arxiv_paper.request_timeout
        self.chunk_size = cfg.tools.download_arxiv_paper.chunk_size
        self.pdf_base_url = cfg.tools.download_arxiv_paper.pdf_base_url

    def fetch_metadata(self, paper_id: str) -> Dict[str, Any]:
        """
        Fetch metadata from arXiv for a given paper ID.

        Args:
            paper_id (str): The arXiv ID of the paper.

        Returns:
            Dict[str, Any]: A dictionary with the raw Atom XML response
            under the key ``"xml"``.

        Raises:
            requests.HTTPError: If the arXiv API returns an error status.
        """
        logger.info("Fetching metadata from arXiv for paper ID: %s", paper_id)
        api_url = f"{self.api_url}?search_query=id:{paper_id}&start=0&max_results=1"
        response = requests.get(api_url, timeout=self.request_timeout)
        response.raise_for_status()
        return {"xml": response.text}

    def download_pdf(self, paper_id: str) -> Dict[str, Any]:
        """
        Download the PDF of a paper from arXiv.

        This function first retrieves the paper's metadata to locate the PDF link
        before downloading the file.

        Args:
            paper_id (str): The arXiv ID of the paper.

        Returns:
            Dict[str, Any]: A dictionary containing:
                - `pdf_object`: The binary content of the downloaded PDF.
                - `pdf_url`: The URL from which the PDF was fetched.
                - `arxiv_id`: The arXiv ID of the downloaded paper.

        Raises:
            RuntimeError: If no PDF link is present in the metadata entry.
            requests.HTTPError: If either HTTP request fails.
        """
        metadata = self.fetch_metadata(paper_id)
        # Parse the Atom XML response to locate the PDF link: the entry's
        # <link> element whose title attribute is "pdf".
        root = ET.fromstring(metadata["xml"])
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        pdf_url = next(
            (
                link.attrib.get("href")
                for entry in root.findall("atom:entry", ns)
                for link in entry.findall("atom:link", ns)
                if link.attrib.get("title") == "pdf"
            ),
            None,
        )
        if not pdf_url:
            raise RuntimeError(f"Failed to download PDF for arXiv ID {paper_id}.")
        logger.info("Downloading PDF from: %s", pdf_url)
        pdf_response = requests.get(pdf_url, stream=True, timeout=self.request_timeout)
        pdf_response.raise_for_status()
        # Stream the body and assemble the PDF bytes chunk by chunk;
        # empty keep-alive chunks are skipped.
        pdf_object = b"".join(
            chunk
            for chunk in pdf_response.iter_content(chunk_size=self.chunk_size)
            if chunk
        )
        return {
            "pdf_object": pdf_object,
            "pdf_url": pdf_url,
            "arxiv_id": paper_id,
        }
|
Initializes the arXiv paper downloader.
Uses Hydra for configuration management to retrieve API details.
Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py
32
33
34
35
36
37
38
39
40
41
42
43
44
def __init__(self):
    """
    Initialize the arXiv paper downloader.

    Uses Hydra for configuration management to retrieve API details
    (API URL, request timeout, chunk size, and PDF base URL).
    """
    with hydra.initialize(version_base=None, config_path="../../configs"):
        cfg = hydra.compose(
            config_name="config", overrides=["tools/download_arxiv_paper=default"]
        )
    # Bind the relevant config subtree once, then copy its fields.
    arxiv_cfg = cfg.tools.download_arxiv_paper
    self.api_url = arxiv_cfg.api_url
    self.request_timeout = arxiv_cfg.request_timeout
    self.chunk_size = arxiv_cfg.chunk_size
    self.pdf_base_url = arxiv_cfg.pdf_base_url
|
Download the PDF of a paper from arXiv.
This function first retrieves the paper's metadata to locate the PDF link
before downloading the file.
Parameters:
Name |
Type |
Description |
Default |
paper_id
|
str
|
The arXiv ID of the paper.
|
required
|
Returns:
Type |
Description |
Dict[str, Any]
|
Dict[str, Any]: A dictionary containing:
- pdf_object : The binary content of the downloaded PDF.
- pdf_url : The URL from which the PDF was fetched.
- arxiv_id : The arXiv ID of the downloaded paper.
|
Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def download_pdf(self, paper_id: str) -> Dict[str, Any]:
    """
    Download the PDF of a paper from arXiv.

    This function first retrieves the paper's metadata to locate the PDF link
    before downloading the file.

    Args:
        paper_id (str): The arXiv ID of the paper.

    Returns:
        Dict[str, Any]: A dictionary containing:
            - `pdf_object`: The binary content of the downloaded PDF.
            - `pdf_url`: The URL from which the PDF was fetched.
            - `arxiv_id`: The arXiv ID of the downloaded paper.

    Raises:
        RuntimeError: If no PDF link is found in the metadata entry.
        requests.HTTPError: If either HTTP request fails.
    """
    metadata = self.fetch_metadata(paper_id)
    # Parse the Atom XML response to locate the PDF link: the entry's
    # <link> element whose title attribute is "pdf".
    root = ET.fromstring(metadata["xml"])
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    pdf_url = next(
        (
            link.attrib.get("href")
            for entry in root.findall("atom:entry", ns)
            for link in entry.findall("atom:link", ns)
            if link.attrib.get("title") == "pdf"
        ),
        None,
    )
    if not pdf_url:
        raise RuntimeError(f"Failed to download PDF for arXiv ID {paper_id}.")
    logger.info("Downloading PDF from: %s", pdf_url)
    pdf_response = requests.get(pdf_url, stream=True, timeout=self.request_timeout)
    pdf_response.raise_for_status()
    # Stream the body and assemble the PDF bytes chunk by chunk;
    # empty keep-alive chunks are skipped. (Debug print statements removed;
    # the URL is already logged above.)
    pdf_object = b"".join(
        chunk
        for chunk in pdf_response.iter_content(chunk_size=self.chunk_size)
        if chunk
    )
    return {
        "pdf_object": pdf_object,
        "pdf_url": pdf_url,
        "arxiv_id": paper_id,
    }
|
Fetch metadata from arXiv for a given paper ID.
Parameters:
Name |
Type |
Description |
Default |
paper_id
|
str
|
The arXiv ID of the paper.
|
required
|
Returns:
Type |
Description |
Dict[str, Any]
|
Dict[str, Any]: A dictionary containing metadata, including the XML response.
|
Source code in aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def fetch_metadata(self, paper_id: str) -> Dict[str, Any]:
    """
    Fetch metadata from arXiv for a given paper ID.

    Args:
        paper_id (str): The arXiv ID of the paper.

    Returns:
        Dict[str, Any]: A dictionary containing metadata, including the XML response.
    """
    logger.info("Fetching metadata from arXiv for paper ID: %s", paper_id)
    # Query the Atom API for exactly one result matching this ID.
    query = f"search_query=id:{paper_id}&start=0&max_results=1"
    response = requests.get(f"{self.api_url}?{query}", timeout=self.request_timeout)
    response.raise_for_status()
    return {"xml": response.text}
|