Skip to content

Base Paper Downloader

Abstract base class for paper download tools. Provides common functionality for arXiv, medRxiv, PubMed, and future paper sources.

BasePaperDownloader

Bases: ABC

Abstract base class for paper download tools.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
class BasePaperDownloader(ABC):
    """
    Abstract base class for paper download tools.

    Subclasses provide the service-specific pieces (metadata lookup, PDF URL
    construction, metadata normalisation, naming). This class provides the
    shared processing loop, PDF streaming to a temporary file, abstract
    snippet extraction, error entries, and the human-readable summary.
    """

    def __init__(self, config: Any):
        """
        Initialize with service-specific configuration.

        Args:
            config: Configuration object. The optional attributes
                ``request_timeout``, ``chunk_size`` and ``user_agent`` are
                read from it; defaults are used when they are absent.
        """
        self.config = config
        self.request_timeout = getattr(config, "request_timeout", 15)
        self.chunk_size = getattr(config, "chunk_size", 8192)
        self.user_agent = getattr(
            config, "user_agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
        )

    # Abstract methods that each service must implement
    @abstractmethod
    def fetch_metadata(self, identifier: str) -> Any:
        """
        Fetch paper metadata from the service API.

        Args:
            identifier: Paper identifier (arXiv ID, DOI, PMID, etc.)

        Returns:
            Service-specific metadata object (XML, JSON, etc.)
        """
        raise NotImplementedError

    @abstractmethod
    def construct_pdf_url(self, metadata: Any, identifier: str) -> str:
        """
        Construct or extract PDF URL from metadata.

        Args:
            metadata: Metadata returned from fetch_metadata()
            identifier: Original paper identifier

        Returns:
            PDF URL string (empty if not available)
        """
        raise NotImplementedError

    @abstractmethod
    def extract_paper_metadata(
        self, metadata: Any, identifier: str, pdf_result: Optional[Tuple[str, str]]
    ) -> Dict[str, Any]:
        """
        Extract and structure metadata into standardized format.

        Args:
            metadata: Raw metadata from API
            identifier: Original paper identifier
            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

        Returns:
            Standardized paper metadata dictionary
        """
        raise NotImplementedError

    @abstractmethod
    def get_service_name(self) -> str:
        """Return service name (e.g., 'arxiv', 'medrxiv', 'pubmed')."""
        raise NotImplementedError

    @abstractmethod
    def get_identifier_name(self) -> str:
        """Return identifier display name (e.g., 'arXiv ID', 'DOI', 'PMID')."""
        raise NotImplementedError

    @abstractmethod
    def get_default_filename(self, identifier: str) -> str:
        """Generate default filename for the paper PDF."""
        raise NotImplementedError

    # Common methods shared by all services
    def download_pdf_to_temp(
        self, pdf_url: str, identifier: str
    ) -> Optional[Tuple[str, str]]:
        """
        Download PDF from URL to a temporary file.

        The temporary file is created with ``delete=False``, so on success it
        stays on disk and the caller owns it. On failure any partially written
        file is removed before ``None`` is returned.

        Args:
            pdf_url: URL to download PDF from
            identifier: Paper identifier for logging

        Returns:
            Tuple of (temp_file_path, filename) or None if failed
        """
        # Function-scope import: only needed here for temp-file cleanup and
        # for stripping directory parts from the server-supplied filename.
        import os

        if not pdf_url:
            logger.info(
                "No PDF URL available for %s %s", self.get_identifier_name(), identifier
            )
            return None

        temp_file_path = ""
        try:
            logger.info(
                "Downloading PDF for %s %s from %s",
                self.get_identifier_name(),
                identifier,
                pdf_url,
            )

            headers = {"User-Agent": self.user_agent}
            response = requests.get(
                pdf_url, headers=headers, timeout=self.request_timeout, stream=True
            )
            try:
                response.raise_for_status()

                # Download to temporary file
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=".pdf"
                ) as temp_file:
                    # Record the path first so a mid-stream failure can clean up.
                    temp_file_path = temp_file.name
                    for chunk in response.iter_content(chunk_size=self.chunk_size):
                        if chunk:  # Filter out keep-alive chunks
                            temp_file.write(chunk)

                content_disposition = response.headers.get("Content-Disposition", "")
            finally:
                # With stream=True the connection is held until the response
                # is closed, so release it on every path.
                response.close()

            logger.info(
                "%s PDF downloaded to temporary file: %s",
                self.get_service_name(),
                temp_file_path,
            )

            # Try to extract filename from Content-Disposition header
            filename = self.get_default_filename(identifier)

            if "filename=" in content_disposition:
                filename_match = re.search(
                    r'filename[*]?=(?:"([^"]+)"|([^;]+))', content_disposition
                )
                if filename_match:
                    extracted_filename = filename_match.group(
                        1
                    ) or filename_match.group(2)
                    extracted_filename = extracted_filename.strip().strip('"')
                    # The header is server-controlled: keep only the final path
                    # component so it can never point outside a target directory.
                    extracted_filename = os.path.basename(
                        extracted_filename.replace("\\", "/")
                    )
                    if extracted_filename and extracted_filename.endswith(".pdf"):
                        filename = extracted_filename
                        logger.info("Extracted filename from header: %s", filename)

            return temp_file_path, filename

        except (requests.exceptions.RequestException, OSError) as e:
            # Do not leave a partial download behind.
            if temp_file_path:
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    logger.debug(
                        "Could not remove partial temporary file: %s", temp_file_path
                    )
            logger.error(
                "Failed to download PDF for %s %s: %s",
                self.get_identifier_name(),
                identifier,
                e,
            )
            return None

    def get_snippet(self, abstract: str) -> str:
        """
        Extract the first one or two sentences from an abstract.

        Sentences are split on the literal ``". "`` separator. Surrounding
        whitespace is ignored so a trailing space or newline does not produce
        a dangling ``" ."`` at the end of the snippet.

        Args:
            abstract: Full abstract text

        Returns:
            Snippet of first 1-2 sentences, or an empty string when the
            abstract is empty, whitespace-only, or "N/A"
        """
        if not abstract:
            return ""

        text = abstract.strip()
        if not text or text == "N/A":
            return ""

        sentences = text.split(". ")
        snippet = ". ".join(sentences[:2])

        if not snippet.endswith("."):
            snippet += "."

        return snippet

    def create_error_entry(self, identifier: str, error_msg: str) -> Dict[str, Any]:
        """
        Create standardized error entry for failed paper processing.

        Args:
            identifier: Paper identifier
            error_msg: Error message

        Returns:
            Error entry dictionary
        """
        return {
            "Title": "Error fetching paper",
            "Authors": [],
            "Abstract": f"Error: {error_msg}",
            "Publication Date": "N/A",
            "URL": "",
            "pdf_url": "",
            "filename": self.get_default_filename(identifier),
            "source": self.get_service_name(),
            "access_type": "error",
            "temp_file_path": "",
            "error": error_msg,
            # Service-specific identifier field will be added by subclasses
        }

    def build_summary(self, article_data: Dict[str, Any]) -> str:
        """
        Build a summary string for up to three papers with snippets.

        Args:
            article_data: Dictionary of paper data keyed by identifier

        Returns:
            Formatted summary string
        """
        top = list(article_data.values())[:3]
        lines: List[str] = []
        # Counted over all papers, not only the three that are listed.
        downloaded_count = sum(
            1
            for paper in article_data.values()
            if paper.get("access_type") == "open_access_downloaded"
        )

        for position, paper in enumerate(top, start=1):
            title = paper.get("Title", "N/A")
            access_type = paper.get("access_type", "N/A")
            temp_file_path = paper.get("temp_file_path", "")
            snippet = self.get_snippet(paper.get("Abstract", ""))

            # Build paper line with service-specific identifier info
            line = f"{position}. {title}"
            line += self._get_paper_identifier_info(paper)
            line += f"\n   Access: {access_type}"

            if temp_file_path:
                line += f"\n   Downloaded to: {temp_file_path}"
            if snippet:
                line += f"\n   Abstract snippet: {snippet}"

            lines.append(line)

        summary = "\n".join(lines)
        service_name = self.get_service_name()

        return (
            f"Download was successful from {service_name}. "
            "Papers metadata are attached as an artifact. "
            "Here is a summary of the results:\n"
            f"Number of papers found: {len(article_data)}\n"
            f"PDFs successfully downloaded: {downloaded_count}\n"
            "Top 3 papers:\n" + summary
        )

    @abstractmethod
    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
        """
        Get service-specific identifier info for paper summary.

        Args:
            paper: Paper metadata dictionary

        Returns:
            Formatted identifier string (e.g., " (arXiv:1234.5678, 2023-01-01)")
        """
        raise NotImplementedError

    def process_identifiers(self, identifiers: List[str]) -> Dict[str, Any]:
        """
        Main processing loop for downloading papers.

        Each identifier is processed independently. A
        ``requests.RequestException`` raised for one identifier is recorded as
        an error entry for that identifier and processing continues; any
        other exception type propagates to the caller.

        Args:
            identifiers: List of paper identifiers

        Returns:
            Dictionary of paper data keyed by identifier
        """
        logger.info(
            "Processing %d identifiers from %s: %s",
            len(identifiers),
            self.get_service_name(),
            identifiers,
        )

        article_data: Dict[str, Any] = {}

        for identifier in identifiers:
            logger.info("Processing %s: %s", self.get_identifier_name(), identifier)

            try:
                # Step 1: Fetch metadata
                metadata = self.fetch_metadata(identifier)

                # Step 2: Extract PDF URL
                pdf_url = self.construct_pdf_url(metadata, identifier)

                # Step 3: Download PDF if available
                pdf_result = None
                if pdf_url:
                    pdf_result = self.download_pdf_to_temp(pdf_url, identifier)

                # Step 4: Extract and structure metadata
                article_data[identifier] = self.extract_paper_metadata(
                    metadata, identifier, pdf_result
                )

            except requests.RequestException as e:
                logger.warning(
                    "Error processing %s %s: %s",
                    self.get_identifier_name(),
                    identifier,
                    str(e),
                )

                # Create error entry
                error_entry = self.create_error_entry(identifier, str(e))
                # Add service-specific identifier field
                self._add_service_identifier(error_entry, identifier)
                article_data[identifier] = error_entry

        return article_data

    @abstractmethod
    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
        """
        Add service-specific identifier field to entry.

        Args:
            entry: Paper entry dictionary to modify
            identifier: Original identifier
        """
        raise NotImplementedError

__init__(config)

Initialize with service-specific configuration.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
22
23
24
25
26
27
28
29
def __init__(self, config: Any):
    """
    Keep the service configuration and derive the HTTP download settings.

    Args:
        config: Configuration object; ``request_timeout``, ``chunk_size`` and
            ``user_agent`` are read from it when present, otherwise the
            defaults below apply.
    """
    self.config = config

    # (attribute name, fallback used when the config does not define it)
    optional_settings = (
        ("request_timeout", 15),
        ("chunk_size", 8192),
        ("user_agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"),
    )
    for setting, fallback in optional_settings:
        setattr(self, setting, getattr(config, setting, fallback))

_add_service_identifier(entry, identifier) abstractmethod

Add service-specific identifier field to entry.

Parameters:

Name Type Description Default
entry Dict[str, Any]

Paper entry dictionary to modify

required
identifier str

Original identifier

required
Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
334
335
336
337
338
339
340
341
342
343
@abstractmethod
def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
    """
    Attach the service's own identifier field to a paper entry.

    Abstract hook: the base implementation only raises, so every concrete
    downloader must override it.

    Args:
        entry: Paper entry dictionary, modified in place
        identifier: Identifier the entry was created for

    Raises:
        NotImplementedError: Always, when not overridden.
    """
    raise NotImplementedError

_get_paper_identifier_info(paper) abstractmethod

Get service-specific identifier info for paper summary.

Parameters:

Name Type Description Default
paper Dict[str, Any]

Paper metadata dictionary

required

Returns:

Type Description
str

Formatted identifier string (e.g., " (arXiv:1234.5678, 2023-01-01)")

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
266
267
268
269
270
271
272
273
274
275
276
277
@abstractmethod
def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
    """
    Format the identifier details shown after a paper title in the summary.

    Abstract hook: the base implementation only raises, so every concrete
    downloader must override it.

    Args:
        paper: Paper metadata dictionary

    Returns:
        Formatted identifier string (e.g., " (arXiv:1234.5678, 2023-01-01)")

    Raises:
        NotImplementedError: Always, when not overridden.
    """
    raise NotImplementedError

construct_pdf_url(metadata, identifier) abstractmethod

Construct or extract PDF URL from metadata.

Parameters:

Name Type Description Default
metadata Any

Metadata returned from fetch_metadata()

required
identifier str

Original paper identifier

required

Returns:

Type Description
str

PDF URL string (empty if not available)

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
45
46
47
48
49
50
51
52
53
54
55
56
57
@abstractmethod
def construct_pdf_url(self, metadata: Any, identifier: str) -> str:
    """
    Derive the PDF download URL for a paper from its metadata.

    Abstract hook: the base implementation only raises, so every concrete
    downloader must override it.

    Args:
        metadata: Metadata returned from fetch_metadata()
        identifier: Original paper identifier

    Returns:
        PDF URL string (empty if not available)

    Raises:
        NotImplementedError: Always, when not overridden.
    """
    raise NotImplementedError

create_error_entry(identifier, error_msg)

Create standardized error entry for failed paper processing.

Parameters:

Name Type Description Default
identifier str

Paper identifier

required
error_msg str

Error message

required

Returns:

Type Description
Dict[str, Any]

Error entry dictionary

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
def create_error_entry(self, identifier: str, error_msg: str) -> Dict[str, Any]:
    """
    Build the placeholder record stored for a paper that could not be processed.

    The record carries empty or "N/A" values for the regular fields, marks
    ``access_type`` as "error", and keeps the message both raw (``error``)
    and prefixed (``Abstract``).

    Args:
        identifier: Paper identifier
        error_msg: Error message

    Returns:
        Error entry dictionary
    """
    # Resolved up front, in the same order the fields appear in the record.
    fallback_filename = self.get_default_filename(identifier)
    service = self.get_service_name()

    record: Dict[str, Any] = {}
    record["Title"] = "Error fetching paper"
    record["Authors"] = []
    record["Abstract"] = f"Error: {error_msg}"
    record["Publication Date"] = "N/A"
    record["URL"] = ""
    record["pdf_url"] = ""
    record["filename"] = fallback_filename
    record["source"] = service
    record["access_type"] = "error"
    record["temp_file_path"] = ""
    record["error"] = error_msg
    # Service-specific identifier field will be added by subclasses
    return record

download_pdf_to_temp(pdf_url, identifier)

Download PDF from URL to a temporary file.

Parameters:

Name Type Description Default
pdf_url str

URL to download PDF from

required
identifier str

Paper identifier for logging

required

Returns:

Type Description
Optional[Tuple[str, str]]

Tuple of (temp_file_path, filename) or None if failed

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def download_pdf_to_temp(
    self, pdf_url: str, identifier: str
) -> Optional[Tuple[str, str]]:
    """
    Download PDF from URL to a temporary file.

    The temporary file is created with ``delete=False``, so on success it
    stays on disk and the caller owns it. On failure any partially written
    file is removed before ``None`` is returned.

    Args:
        pdf_url: URL to download PDF from
        identifier: Paper identifier for logging

    Returns:
        Tuple of (temp_file_path, filename) or None if failed
    """
    # Function-scope import: only needed here for temp-file cleanup and for
    # stripping directory parts from the server-supplied filename.
    import os

    if not pdf_url:
        logger.info(
            "No PDF URL available for %s %s", self.get_identifier_name(), identifier
        )
        return None

    temp_file_path = ""
    try:
        logger.info(
            "Downloading PDF for %s %s from %s",
            self.get_identifier_name(),
            identifier,
            pdf_url,
        )

        headers = {"User-Agent": self.user_agent}
        response = requests.get(
            pdf_url, headers=headers, timeout=self.request_timeout, stream=True
        )
        try:
            response.raise_for_status()

            # Download to temporary file
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                # Record the path first so a mid-stream failure can clean up.
                temp_file_path = temp_file.name
                for chunk in response.iter_content(chunk_size=self.chunk_size):
                    if chunk:  # Filter out keep-alive chunks
                        temp_file.write(chunk)

            content_disposition = response.headers.get("Content-Disposition", "")
        finally:
            # With stream=True the connection is held until the response is
            # closed, so release it on every path.
            response.close()

        logger.info(
            "%s PDF downloaded to temporary file: %s",
            self.get_service_name(),
            temp_file_path,
        )

        # Try to extract filename from Content-Disposition header
        filename = self.get_default_filename(identifier)

        if "filename=" in content_disposition:
            filename_match = re.search(
                r'filename[*]?=(?:"([^"]+)"|([^;]+))', content_disposition
            )
            if filename_match:
                extracted_filename = filename_match.group(
                    1
                ) or filename_match.group(2)
                extracted_filename = extracted_filename.strip().strip('"')
                # The header is server-controlled: keep only the final path
                # component so it can never point outside a target directory.
                extracted_filename = os.path.basename(
                    extracted_filename.replace("\\", "/")
                )
                if extracted_filename and extracted_filename.endswith(".pdf"):
                    filename = extracted_filename
                    logger.info("Extracted filename from header: %s", filename)

        return temp_file_path, filename

    except (requests.exceptions.RequestException, OSError) as e:
        # Do not leave a partial download behind.
        if temp_file_path:
            try:
                os.unlink(temp_file_path)
            except OSError:
                logger.debug(
                    "Could not remove partial temporary file: %s", temp_file_path
                )
        logger.error(
            "Failed to download PDF for %s %s: %s",
            self.get_identifier_name(),
            identifier,
            e,
        )
        return None

extract_paper_metadata(metadata, identifier, pdf_result) abstractmethod

Extract and structure metadata into standardized format.

Parameters:

Name Type Description Default
metadata Any

Raw metadata from API

required
identifier str

Original paper identifier

required
pdf_result Optional[Tuple[str, str]]

Tuple of (temp_file_path, filename) if PDF downloaded

required

Returns:

Type Description
Dict[str, Any]

Standardized paper metadata dictionary

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
@abstractmethod
def extract_paper_metadata(
    self, metadata: Any, identifier: str, pdf_result: Optional[Tuple[str, str]]
) -> Dict[str, Any]:
    """
    Convert raw service metadata into the standardized paper dictionary.

    Abstract hook: the base implementation only raises, so every concrete
    downloader must override it.

    Args:
        metadata: Raw metadata from API
        identifier: Original paper identifier
        pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded,
            otherwise None

    Returns:
        Standardized paper metadata dictionary

    Raises:
        NotImplementedError: Always, when not overridden.
    """
    raise NotImplementedError

fetch_metadata(identifier) abstractmethod

Fetch paper metadata from the service API.

Parameters:

Name Type Description Default
identifier str

Paper identifier (arXiv ID, DOI, PMID, etc.)

required

Returns:

Type Description
Any

Service-specific metadata object (XML, JSON, etc.)

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
32
33
34
35
36
37
38
39
40
41
42
43
@abstractmethod
def fetch_metadata(self, identifier: str) -> Any:
    """
    Retrieve the raw metadata for one paper from the service API.

    Abstract hook: the base implementation only raises, so every concrete
    downloader must override it.

    Args:
        identifier: Paper identifier (arXiv ID, DOI, PMID, etc.)

    Returns:
        Service-specific metadata object (XML, JSON, etc.)

    Raises:
        NotImplementedError: Always, when not overridden.
    """
    raise NotImplementedError

get_default_filename(identifier) abstractmethod

Generate default filename for the paper PDF.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
86
87
88
89
@abstractmethod
def get_default_filename(self, identifier: str) -> str:
    """Return the fallback PDF filename for ``identifier``; must be overridden."""
    raise NotImplementedError

get_identifier_name() abstractmethod

Return identifier display name (e.g., 'arXiv ID', 'DOI', 'PMID').

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
81
82
83
84
@abstractmethod
def get_identifier_name(self) -> str:
    """Return the identifier's display label (e.g., 'arXiv ID', 'DOI', 'PMID'); must be overridden."""
    raise NotImplementedError

get_service_name() abstractmethod

Return service name (e.g., 'arxiv', 'medrxiv', 'pubmed').

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
76
77
78
79
@abstractmethod
def get_service_name(self) -> str:
    """Return the service's short name (e.g., 'arxiv', 'medrxiv', 'pubmed'); must be overridden."""
    raise NotImplementedError

get_snippet(abstract)

Extract the first one or two sentences from an abstract.

Parameters:

Name Type Description Default
abstract str

Full abstract text

required

Returns:

Type Description
str

Snippet of first 1-2 sentences

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def get_snippet(self, abstract: str) -> str:
    """
    Extract the first one or two sentences from an abstract.

    Sentences are split on the literal ``". "`` separator. Surrounding
    whitespace is ignored so a trailing space or newline does not produce a
    dangling ``" ."`` at the end of the snippet.

    Args:
        abstract: Full abstract text

    Returns:
        Snippet of first 1-2 sentences, or an empty string when the abstract
        is empty, whitespace-only, or "N/A"
    """
    if not abstract:
        return ""

    text = abstract.strip()
    if not text or text == "N/A":
        return ""

    sentences = text.split(". ")
    snippet = ". ".join(sentences[:2])

    if not snippet.endswith("."):
        snippet += "."

    return snippet

process_identifiers(identifiers)

Main processing loop for downloading papers.

Parameters:

Name Type Description Default
identifiers List[str]

List of paper identifiers

required

Returns:

Type Description
Dict[str, Any]

Dictionary of paper data keyed by identifier

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
def process_identifiers(self, identifiers: List[str]) -> Dict[str, Any]:
    """
    Run the fetch / locate / download / normalise pipeline for each identifier.

    A ``requests.RequestException`` raised while handling one identifier is
    logged and stored as an error entry for that identifier; the loop then
    moves on to the next one. Other exception types are not caught here.

    Args:
        identifiers: List of paper identifiers

    Returns:
        Dictionary of paper data keyed by identifier
    """
    logger.info(
        "Processing %d identifiers from %s: %s",
        len(identifiers),
        self.get_service_name(),
        identifiers,
    )

    results: Dict[str, Any] = {}

    for paper_id in identifiers:
        logger.info("Processing %s: %s", self.get_identifier_name(), paper_id)

        try:
            raw_metadata = self.fetch_metadata(paper_id)
            url = self.construct_pdf_url(raw_metadata, paper_id)

            # Only attempt a download when the service produced a URL.
            downloaded = self.download_pdf_to_temp(url, paper_id) if url else None

            results[paper_id] = self.extract_paper_metadata(
                raw_metadata, paper_id, downloaded
            )
        except requests.RequestException as exc:
            logger.warning(
                "Error processing %s %s: %s",
                self.get_identifier_name(),
                paper_id,
                str(exc),
            )

            # Record the failure, tagged with the service-specific identifier field.
            failed_entry = self.create_error_entry(paper_id, str(exc))
            self._add_service_identifier(failed_entry, paper_id)
            results[paper_id] = failed_entry

    return results