Skip to content

ArXiv Downloader

ArXiv paper downloader implementation.

ArxivDownloader

Bases: BasePaperDownloader

ArXiv-specific implementation of paper downloader.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
class ArxivDownloader(BasePaperDownloader):
    """ArXiv-specific implementation of paper downloader."""

    def __init__(self, config: Any):
        """Initialize ArXiv downloader with configuration."""
        super().__init__(config)
        self.api_url = config.api_url
        self.pdf_base_url = config.pdf_base_url
        # Atom namespace mapping; fall back to the standard one if the
        # configuration does not supply `xml_namespace`.
        self.xml_namespaces = getattr(
            config, "xml_namespace", {"atom": "http://www.w3.org/2005/Atom"}
        )

    def fetch_metadata(self, identifier: str) -> ET.Element:
        """
        Fetch paper metadata from arXiv API.

        Args:
            identifier: arXiv ID (e.g., '1234.5678' or '2301.12345')

        Returns:
            XML root element from arXiv API response

        Raises:
            requests.RequestException: If API call fails
            RuntimeError: If no entry found in response
        """
        query_url = f"{self.api_url}?search_query=id:{identifier}&start=0&max_results=1"
        logger.info("Fetching metadata for arXiv ID %s from: %s", identifier, query_url)

        response = requests.get(query_url, timeout=self.request_timeout)
        response.raise_for_status()

        feed = ET.fromstring(response.text)
        # A feed without an <entry> means arXiv did not recognize the ID.
        if feed.find("atom:entry", self.xml_namespaces) is None:
            raise RuntimeError("No entry found in arXiv API response")

        return feed

    def construct_pdf_url(self, metadata: ET.Element, identifier: str) -> str:
        """
        Extract or construct PDF URL from arXiv metadata.

        Args:
            metadata: XML root from arXiv API
            identifier: arXiv ID

        Returns:
            PDF URL string (empty string when the feed has no entry)
        """
        entry = metadata.find("atom:entry", self.xml_namespaces)
        if entry is None:
            return ""

        # Prefer the first <link title="pdf"> advertised by the API.
        pdf_url = None
        for link in entry.findall("atom:link", self.xml_namespaces):
            if link.attrib.get("title") == "pdf":
                pdf_url = link.attrib.get("href")
                break

        # Otherwise derive the URL from the configured PDF base.
        if not pdf_url:
            pdf_url = f"{self.pdf_base_url}/{identifier}.pdf"
            logger.info("Using constructed PDF URL for %s: %s", identifier, pdf_url)

        return pdf_url

    def extract_paper_metadata(
        self,
        metadata: ET.Element,
        identifier: str,
        pdf_result: Optional[Tuple[str, str]],
    ) -> Dict[str, Any]:
        """
        Extract structured metadata from arXiv API response.

        Args:
            metadata: XML root from arXiv API
            identifier: arXiv ID
            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

        Returns:
            Standardized paper metadata dictionary

        Raises:
            RuntimeError: If no entry found in metadata
        """
        entry = metadata.find("atom:entry", self.xml_namespaces)
        if entry is None:
            raise RuntimeError("No entry found in metadata")

        # Merge the bibliographic fields with the PDF download outcome,
        # then stamp the record with its source identifiers.
        record: Dict[str, Any] = {}
        record.update(self._extract_basic_metadata(entry, self.xml_namespaces))
        record.update(self._extract_pdf_metadata(pdf_result, identifier))
        record["source"] = "arxiv"
        record["arxiv_id"] = identifier
        return record

    def _extract_basic_metadata(self, entry: ET.Element, ns: dict) -> Dict[str, Any]:
        """Extract basic metadata (title, authors, abstract, date) from entry."""
        return {
            "Title": self._extract_title(entry, ns),
            "Authors": self._extract_authors(entry, ns),
            "Abstract": self._extract_abstract(entry, ns),
            "Publication Date": self._extract_publication_date(entry, ns),
        }

    def _extract_title(self, entry: ET.Element, ns: dict) -> str:
        """Extract title from entry; 'N/A' when the element is absent."""
        node = entry.find("atom:title", ns)
        if node is None:
            return "N/A"
        return (node.text or "").strip()

    def _extract_authors(self, entry: ET.Element, ns: dict) -> list:
        """Extract author names from entry, skipping empty <name> elements."""
        name_nodes = (
            author.find("atom:name", ns)
            for author in entry.findall("atom:author", ns)
        )
        return [node.text.strip() for node in name_nodes if node is not None and node.text]

    def _extract_abstract(self, entry: ET.Element, ns: dict) -> str:
        """Extract abstract from entry; 'N/A' when the element is absent."""
        node = entry.find("atom:summary", ns)
        if node is None:
            return "N/A"
        return (node.text or "").strip()

    def _extract_publication_date(self, entry: ET.Element, ns: dict) -> str:
        """Extract publication date from entry; 'N/A' when the element is absent."""
        node = entry.find("atom:published", ns)
        if node is None:
            return "N/A"
        return (node.text or "").strip()

    def _extract_pdf_metadata(
        self, pdf_result: Optional[Tuple[str, str]], identifier: str
    ) -> Dict[str, Any]:
        """Extract PDF-related metadata for a successful or failed download."""
        if not pdf_result:
            # Download failed: no local file, fall back to a default filename.
            return {
                "URL": "",
                "pdf_url": "",
                "filename": self.get_default_filename(identifier),
                "access_type": "download_failed",
                "temp_file_path": "",
            }

        temp_file_path, filename = pdf_result
        return {
            "URL": temp_file_path,
            "pdf_url": temp_file_path,
            "filename": filename,
            "access_type": "open_access_downloaded",
            "temp_file_path": temp_file_path,
        }

    def get_service_name(self) -> str:
        """Return service name."""
        return "arXiv"

    def get_identifier_name(self) -> str:
        """Return identifier display name."""
        return "arXiv ID"

    def get_default_filename(self, identifier: str) -> str:
        """Generate default filename for arXiv paper."""
        return f"{identifier}.pdf"

    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
        """Get arXiv-specific identifier info for paper summary."""
        arxiv_id = paper.get("arxiv_id", "N/A")
        pub_date = paper.get("Publication Date", "N/A")
        return f" (arXiv:{arxiv_id}, {pub_date})"

    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
        """Add arXiv ID field to entry."""
        entry["arxiv_id"] = identifier

__init__(config)

Initialize ArXiv downloader with configuration.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
20
21
22
23
24
25
26
27
28
def __init__(self, config: Any):
    """Initialize ArXiv downloader with configuration."""
    super().__init__(config)
    self.api_url = config.api_url
    self.pdf_base_url = config.pdf_base_url
    # XML namespace configuration
    self.xml_namespaces = getattr(
        config, "xml_namespace", {"atom": "http://www.w3.org/2005/Atom"}
    )

_add_service_identifier(entry, identifier)

Add arXiv ID field to entry.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
207
208
209
def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
    """Add arXiv ID field to entry."""
    entry["arxiv_id"] = identifier

_extract_abstract(entry, ns)

Extract abstract from entry.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
155
156
157
158
def _extract_abstract(self, entry: ET.Element, ns: dict) -> str:
    """Extract abstract from entry."""
    summary_elem = entry.find("atom:summary", ns)
    return (summary_elem.text or "").strip() if summary_elem is not None else "N/A"

_extract_authors(entry, ns)

Extract authors from entry.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
146
147
148
149
150
151
152
153
def _extract_authors(self, entry: ET.Element, ns: dict) -> list:
    """Extract authors from entry."""
    authors = []
    for author_elem in entry.findall("atom:author", ns):
        name_elem = author_elem.find("atom:name", ns)
        if name_elem is not None and name_elem.text:
            authors.append(name_elem.text.strip())
    return authors

_extract_basic_metadata(entry, ns)

Extract basic metadata (title, authors, abstract, date) from entry.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
127
128
129
130
131
132
133
134
135
136
137
138
139
def _extract_basic_metadata(self, entry: ET.Element, ns: dict) -> Dict[str, Any]:
    """Extract basic metadata (title, authors, abstract, date) from entry."""
    title = self._extract_title(entry, ns)
    authors = self._extract_authors(entry, ns)
    abstract = self._extract_abstract(entry, ns)
    pub_date = self._extract_publication_date(entry, ns)

    return {
        "Title": title,
        "Authors": authors,
        "Abstract": abstract,
        "Publication Date": pub_date,
    }

_extract_pdf_metadata(pdf_result, identifier)

Extract PDF-related metadata.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def _extract_pdf_metadata(
    self, pdf_result: Optional[Tuple[str, str]], identifier: str
) -> Dict[str, Any]:
    """Extract PDF-related metadata."""
    if pdf_result:
        temp_file_path, filename = pdf_result
        return {
            "URL": temp_file_path,
            "pdf_url": temp_file_path,
            "filename": filename,
            "access_type": "open_access_downloaded",
            "temp_file_path": temp_file_path,
        }

    return {
        "URL": "",
        "pdf_url": "",
        "filename": self.get_default_filename(identifier),
        "access_type": "download_failed",
        "temp_file_path": "",
    }

_extract_publication_date(entry, ns)

Extract publication date from entry.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
160
161
162
163
164
165
def _extract_publication_date(self, entry: ET.Element, ns: dict) -> str:
    """Extract publication date from entry."""
    published_elem = entry.find("atom:published", ns)
    return (
        (published_elem.text or "").strip() if published_elem is not None else "N/A"
    )

_extract_title(entry, ns)

Extract title from entry.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
141
142
143
144
def _extract_title(self, entry: ET.Element, ns: dict) -> str:
    """Extract title from entry."""
    title_elem = entry.find("atom:title", ns)
    return (title_elem.text or "").strip() if title_elem is not None else "N/A"

_get_paper_identifier_info(paper)

Get arXiv-specific identifier info for paper summary.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
201
202
203
204
205
def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
    """Get arXiv-specific identifier info for paper summary."""
    arxiv_id = paper.get("arxiv_id", "N/A")
    pub_date = paper.get("Publication Date", "N/A")
    return f" (arXiv:{arxiv_id}, {pub_date})"

construct_pdf_url(metadata, identifier)

Extract or construct PDF URL from arXiv metadata.

Parameters:

Name Type Description Default
metadata Element

XML root from arXiv API

required
identifier str

arXiv ID

required

Returns:

Type Description
str

PDF URL string

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def construct_pdf_url(self, metadata: ET.Element, identifier: str) -> str:
    """
    Extract or construct PDF URL from arXiv metadata.

    Args:
        metadata: XML root from arXiv API
        identifier: arXiv ID

    Returns:
        PDF URL string
    """
    entry = metadata.find("atom:entry", self.xml_namespaces)

    if entry is None:
        return ""

    # Try to find PDF link in metadata first
    pdf_url = next(
        (
            link.attrib.get("href")
            for link in entry.findall("atom:link", self.xml_namespaces)
            if link.attrib.get("title") == "pdf"
        ),
        None,
    )

    # Fallback to constructed PDF URL if not found in metadata
    if not pdf_url:
        pdf_url = f"{self.pdf_base_url}/{identifier}.pdf"
        logger.info("Using constructed PDF URL for %s: %s", identifier, pdf_url)

    return pdf_url

extract_paper_metadata(metadata, identifier, pdf_result)

Extract structured metadata from arXiv API response.

Parameters:

Name Type Description Default
metadata Element

XML root from arXiv API

required
identifier str

arXiv ID

required
pdf_result Optional[Tuple[str, str]]

Tuple of (temp_file_path, filename) if PDF downloaded

required

Returns:

Type Description
Dict[str, Any]

Standardized paper metadata dictionary

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def extract_paper_metadata(
    self,
    metadata: ET.Element,
    identifier: str,
    pdf_result: Optional[Tuple[str, str]],
) -> Dict[str, Any]:
    """
    Extract structured metadata from arXiv API response.

    Args:
        metadata: XML root from arXiv API
        identifier: arXiv ID
        pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

    Returns:
        Standardized paper metadata dictionary
    """
    entry = metadata.find("atom:entry", self.xml_namespaces)

    if entry is None:
        raise RuntimeError("No entry found in metadata")

    # Extract basic metadata
    basic_metadata = self._extract_basic_metadata(entry, self.xml_namespaces)

    # Handle PDF download results
    pdf_metadata = self._extract_pdf_metadata(pdf_result, identifier)

    # Combine all metadata
    return {
        **basic_metadata,
        **pdf_metadata,
        "source": "arxiv",
        "arxiv_id": identifier,
    }

fetch_metadata(identifier)

Fetch paper metadata from arXiv API.

Parameters:

Name Type Description Default
identifier str

arXiv ID (e.g., '1234.5678' or '2301.12345')

required

Returns:

Type Description
Element

XML root element from arXiv API response

Raises:

Type Description
RequestException

If API call fails

RuntimeError

If no entry found in response

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def fetch_metadata(self, identifier: str) -> ET.Element:
    """
    Fetch paper metadata from arXiv API.

    Args:
        identifier: arXiv ID (e.g., '1234.5678' or '2301.12345')

    Returns:
        XML root element from arXiv API response

    Raises:
        requests.RequestException: If API call fails
        RuntimeError: If no entry found in response
    """
    query_url = f"{self.api_url}?search_query=id:{identifier}&start=0&max_results=1"
    logger.info("Fetching metadata for arXiv ID %s from: %s", identifier, query_url)

    response = requests.get(query_url, timeout=self.request_timeout)
    response.raise_for_status()

    root = ET.fromstring(response.text)
    entry = root.find("atom:entry", self.xml_namespaces)

    if entry is None:
        raise RuntimeError("No entry found in arXiv API response")

    return root

get_default_filename(identifier)

Generate default filename for arXiv paper.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
197
198
199
def get_default_filename(self, identifier: str) -> str:
    """Generate default filename for arXiv paper."""
    return f"{identifier}.pdf"

get_identifier_name()

Return identifier display name.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
193
194
195
def get_identifier_name(self) -> str:
    """Return identifier display name."""
    return "arXiv ID"

get_service_name()

Return service name.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py
189
190
191
def get_service_name(self) -> str:
    """Return service name."""
    return "arXiv"