Skip to content

PubMed Downloader

PubMed paper downloader implementation.

PubmedDownloader

Bases: BasePaperDownloader

PubMed-specific implementation of paper downloader.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
class PubmedDownloader(BasePaperDownloader):
    """PubMed-specific implementation of paper downloader.

    Resolves a PMID to a PMCID through the ID converter endpoint and then
    tries several sources in turn to locate a downloadable PDF: the OA API,
    the Europe PMC service, the PDF meta tag of the PMC article page, and a
    direct PMC PDF URL pattern. All endpoint URLs come from the configuration
    object passed to the constructor.
    """

    def __init__(self, config: Any):
        """Initialize PubMed downloader with configuration.

        Args:
            config: Configuration object. The endpoint attributes read below
                are required; ``id_converter_format``, ``pdf_meta_name`` and
                ``default_error_code`` fall back to defaults when absent.
        """
        super().__init__(config)
        self.id_converter_url = config.id_converter_url
        self.oa_api_url = config.oa_api_url

        # Alternative PDF sources
        self.europe_pmc_base_url = config.europe_pmc_base_url
        self.pmc_page_base_url = config.pmc_page_base_url
        self.direct_pmc_pdf_base_url = config.direct_pmc_pdf_base_url

        # URL conversion for NCBI FTP links
        self.ftp_base_url = config.ftp_base_url
        self.https_base_url = config.https_base_url
        # Configuration values
        self.id_converter_format = getattr(config, "id_converter_format", "json")
        self.pdf_meta_name = getattr(config, "pdf_meta_name", "citation_pdf_url")
        self.default_error_code = getattr(config, "default_error_code", "unknown")

    def fetch_metadata(self, identifier: str) -> Dict[str, Any]:
        """
        Fetch paper metadata from PubMed ID Converter API.

        Args:
            identifier: PMID (e.g., '12345678')

        Returns:
            JSON response from PMC ID Converter API

        Raises:
            requests.RequestException: If API call fails
            RuntimeError: If no records found in response
        """
        query_url = f"{self.id_converter_url}?ids={identifier}&format={self.id_converter_format}"
        logger.info(
            "Fetching metadata from ID converter for PMID %s: %s", identifier, query_url
        )

        response = requests.get(query_url, timeout=self.request_timeout)
        response.raise_for_status()

        result = response.json()
        logger.info("ID converter response for PMID %s: %s", identifier, result)

        if "records" not in result or not result["records"]:
            raise RuntimeError("No records found in PMC ID Converter API response")

        return result

    def construct_pdf_url(self, metadata: Dict[str, Any], identifier: str) -> str:
        """
        Construct PDF URL using multiple fallback strategies.

        Args:
            metadata: JSON response from ID converter
            identifier: PMID

        Returns:
            PDF URL string (empty if no PDF available)
        """
        if "records" not in metadata or not metadata["records"]:
            return ""

        record = metadata["records"][0]
        pmcid = record.get("pmcid", "")

        # Without a PMCID none of the PDF sources can be queried.
        if not pmcid or pmcid == "N/A":
            logger.info("No PMCID available for PDF fetch: PMID %s", identifier)
            return ""

        return self._fetch_pdf_url_with_fallbacks(pmcid)

    def _fetch_pdf_url_with_fallbacks(self, pmcid: str) -> str:
        """
        Fetch PDF URL from OA API with comprehensive fallback strategies.

        Args:
            pmcid: PMC ID (e.g., 'PMC1234567')

        Returns:
            PDF URL string (empty if all strategies fail)
        """
        logger.info("Fetching PDF URL for PMCID: %s", pmcid)

        # Strategy 1: Official OA API (fastest when it works)
        pdf_url = self._try_oa_api(pmcid)
        if pdf_url:
            return pdf_url

        # Strategy 2: Europe PMC Service (most reliable fallback)
        pdf_url = self._try_europe_pmc(pmcid)
        if pdf_url:
            return pdf_url

        # Strategy 3: Scrape PMC page for citation_pdf_url meta tag
        pdf_url = self._try_pmc_page_scraping(pmcid)
        if pdf_url:
            return pdf_url

        # Strategy 4: Direct PMC PDF URL pattern (least reliable)
        pdf_url = self._try_direct_pmc_url(pmcid)
        if pdf_url:
            return pdf_url

        logger.warning("All PDF URL strategies failed for PMCID: %s", pmcid)
        return ""

    def _try_oa_api(self, pmcid: str) -> str:
        """Try to get PDF URL from official OA API.

        Args:
            pmcid: PMC ID to look up.

        Returns:
            The PDF URL advertised by the OA API (FTP links rewritten to
            HTTPS), or an empty string when the request fails, the response
            is not well-formed XML, the API reports an error, or no PDF link
            is present.
        """
        query_url = f"{self.oa_api_url}?id={pmcid}"
        logger.info("Trying OA API for PMCID %s: %s", pmcid, query_url)

        try:
            response = requests.get(query_url, timeout=self.request_timeout)
            response.raise_for_status()

            logger.info("OA API response for PMCID %s: %s", pmcid, response.text[:500])

            # Parse XML response
            root = ET.fromstring(response.text)
        except requests.RequestException as e:
            logger.info("OA API failed for %s: %s", pmcid, str(e))
            return ""
        except ET.ParseError as e:
            # A malformed payload must not abort the fallback chain; report
            # "no URL" so the caller moves on to the next strategy.
            logger.info("OA API returned unparseable XML for %s: %s", pmcid, str(e))
            return ""

        # Check for error first
        error_elem = root.find(".//error")
        if error_elem is not None:
            error_code = error_elem.get("code", self.default_error_code)
            error_text = error_elem.text or "unknown error"
            logger.info(
                "OA API error for PMCID %s: %s - %s", pmcid, error_code, error_text
            )
            return ""

        # Look for PDF link
        pdf_link = root.find(".//link[@format='pdf']")
        if pdf_link is None:
            return ""

        pdf_url = pdf_link.get("href", "")
        logger.info("Found PDF URL from OA API for PMCID %s: %s", pmcid, pdf_url)

        # Convert FTP links to HTTPS for download compatibility. Only the
        # leading base is swapped so later occurrences in the path survive.
        if pdf_url.startswith(self.ftp_base_url):
            pdf_url = self.https_base_url + pdf_url[len(self.ftp_base_url):]
            logger.info("Converted FTP to HTTPS for %s: %s", pmcid, pdf_url)

        return pdf_url

    def _try_europe_pmc(self, pmcid: str) -> str:
        """Try Europe PMC service for PDF.

        Args:
            pmcid: PMC ID to look up.

        Returns:
            The Europe PMC render URL when a HEAD request to it ends in
            HTTP 200, otherwise an empty string.
        """
        europe_pmc_url = f"{self.europe_pmc_base_url}?accid={pmcid}&blobtype=pdf"
        logger.info("Trying Europe PMC service for %s: %s", pmcid, europe_pmc_url)

        try:
            # requests.head() does not follow redirects unless asked to, so a
            # redirecting endpoint would otherwise never report 200.
            response = requests.head(
                europe_pmc_url, timeout=self.request_timeout, allow_redirects=True
            )
            if response.status_code == 200:
                logger.info("Europe PMC service works for %s", pmcid)
                return europe_pmc_url
        except requests.RequestException as e:
            logger.info("Europe PMC service failed for %s: %s", pmcid, str(e))

        return ""

    def _try_pmc_page_scraping(self, pmcid: str) -> str:
        """Try scraping PMC page for PDF meta tag.

        Args:
            pmcid: PMC ID whose article page is fetched.

        Returns:
            The ``content`` of the meta tag named ``self.pdf_meta_name``, or
            an empty string when the request fails or the tag is missing or
            empty.
        """
        pmc_page_url = f"{self.pmc_page_base_url}/{pmcid}/"
        logger.info(
            "Scraping PMC page for PDF meta tag for %s: %s", pmcid, pmc_page_url
        )

        try:
            headers = {"User-Agent": self.user_agent}
            response = requests.get(
                pmc_page_url, headers=headers, timeout=self.request_timeout
            )
            response.raise_for_status()

            soup = BeautifulSoup(response.content, "html.parser")

            # Look for PDF meta tag
            pdf_meta = soup.find("meta", attrs={"name": self.pdf_meta_name})
            if pdf_meta is not None:
                # Cast to Tag to help type checker understand this is a BeautifulSoup Tag object
                meta_tag = cast(Tag, pdf_meta)
                content = meta_tag.get("content")
                if content:
                    logger.info(
                        "Found %s meta tag for %s: %s",
                        self.pdf_meta_name,
                        pmcid,
                        content,
                    )
                    return str(content)

        except requests.RequestException as e:
            logger.info("PMC page scraping failed for %s: %s", pmcid, str(e))

        return ""

    def _try_direct_pmc_url(self, pmcid: str) -> str:
        """Try direct PMC PDF URL pattern.

        Args:
            pmcid: PMC ID to build the URL from.

        Returns:
            The direct PDF URL when a HEAD request to it ends in HTTP 200,
            otherwise an empty string.
        """
        direct_pmc_url = f"{self.direct_pmc_pdf_base_url}/{pmcid}/pdf/"
        logger.info("Trying direct PMC PDF URL for %s: %s", pmcid, direct_pmc_url)

        try:
            # requests.head() does not follow redirects unless asked to, so a
            # redirecting PDF endpoint would otherwise never report 200.
            response = requests.head(
                direct_pmc_url, timeout=self.request_timeout, allow_redirects=True
            )
            if response.status_code == 200:
                logger.info("Direct PMC PDF URL works for %s", pmcid)
                return direct_pmc_url
        except requests.RequestException as e:
            logger.info("Direct PMC PDF URL failed for %s: %s", pmcid, str(e))

        return ""

    def extract_paper_metadata(
        self,
        metadata: Dict[str, Any],
        identifier: str,
        pdf_result: Optional[Tuple[str, str]],
    ) -> Dict[str, Any]:
        """
        Extract structured metadata from PubMed ID converter response.

        Args:
            metadata: JSON response from ID converter
            identifier: PMID
            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

        Returns:
            Standardized paper metadata dictionary

        Raises:
            RuntimeError: If the response contains no records
        """
        records = metadata.get("records")
        if not records:
            raise RuntimeError("No records found in metadata")

        record = records[0]  # Get first (and should be only) record

        # Extract basic fields from ID converter. Empty or null values are
        # treated like missing ones, matching construct_pdf_url().
        pmcid = record.get("pmcid") or "N/A"
        doi = record.get("doi") or "N/A"

        # Handle PDF download results
        if pdf_result:
            temp_file_path, filename = pdf_result
            access_type = "open_access_downloaded"
            pdf_url = temp_file_path  # Use local temp file path
        else:
            temp_file_path = ""
            filename = self.get_default_filename(identifier)
            access_type = "abstract_only" if pmcid != "N/A" else "no_pmcid"
            pdf_url = ""

        # Note: For PubMed, we don't get title/authors from ID converter
        # In a real implementation, you might want to call E-utilities for full metadata
        # For now, we'll use placeholders and focus on the ID conversion functionality

        return {
            "Title": (
                f"PubMed Article {identifier}"
            ),  # Placeholder - would need E-utilities for real title
            "Authors": [],  # Placeholder - would need E-utilities for real authors
            "Abstract": "Abstract available in PubMed",  # Placeholder
            "Publication Date": "N/A",  # Would need E-utilities for this
            "PMID": identifier,
            "PMCID": pmcid,
            "DOI": doi,
            "Journal": "N/A",  # Would need E-utilities for this
            "URL": pdf_url,
            "pdf_url": pdf_url,
            "access_type": access_type,
            "filename": filename,
            "source": "pubmed",
            "temp_file_path": temp_file_path,
        }

    def get_service_name(self) -> str:
        """Return service name."""
        return "PubMed"

    def get_identifier_name(self) -> str:
        """Return identifier display name."""
        return "PMID"

    def get_default_filename(self, identifier: str) -> str:
        """Generate default filename for PubMed paper."""
        return f"pmid_{identifier}.pdf"

    def get_snippet(self, abstract: str) -> str:
        """Override to handle PubMed-specific abstract placeholder.

        Returns an empty string for a missing abstract, "N/A", or the
        placeholder text emitted by extract_paper_metadata(); otherwise
        defers to the base class.
        """
        if (
            not abstract
            or abstract == "N/A"
            or abstract == "Abstract available in PubMed"
        ):
            return ""
        return super().get_snippet(abstract)

    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
        """Get PubMed-specific identifier info for paper summary."""
        pmid = paper.get("PMID", "N/A")
        pmcid = paper.get("PMCID", "N/A")

        info = f" (PMID: {pmid})"
        # The PMCID line is only added when a real PMCID is recorded.
        if pmcid != "N/A":
            info += f"\n   PMCID: {pmcid}"

        return info

    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
        """Add PMID and PubMed-specific fields to entry."""
        entry["PMID"] = identifier
        entry["PMCID"] = "N/A"
        entry["DOI"] = "N/A"
        entry["Journal"] = "N/A"
__init__(config)

Initialize PubMed downloader with configuration.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def __init__(self, config: Any):
    """Set up the PubMed downloader from a configuration object.

    Args:
        config: Configuration object. The endpoint attributes read below are
            required; ``id_converter_format``, ``pdf_meta_name`` and
            ``default_error_code`` fall back to defaults when absent.
    """
    super().__init__(config)

    def optional(name: str, fallback: Any) -> Any:
        # Read a setting that the configuration is allowed to omit.
        return getattr(config, name, fallback)

    # Endpoints for ID conversion and the OA lookup
    self.id_converter_url = config.id_converter_url
    self.oa_api_url = config.oa_api_url

    # Fallback sources consulted when the OA lookup yields nothing
    self.europe_pmc_base_url = config.europe_pmc_base_url
    self.pmc_page_base_url = config.pmc_page_base_url
    self.direct_pmc_pdf_base_url = config.direct_pmc_pdf_base_url

    # Base URLs used to rewrite NCBI FTP links as HTTPS links
    self.ftp_base_url = config.ftp_base_url
    self.https_base_url = config.https_base_url

    # Settings with built-in defaults
    self.id_converter_format = optional("id_converter_format", "json")
    self.pdf_meta_name = optional("pdf_meta_name", "citation_pdf_url")
    self.default_error_code = optional("default_error_code", "unknown")

_add_service_identifier(entry, identifier)

Add PMID and PubMed-specific fields to entry.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
332
333
334
335
336
337
def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
    """Add PMID and PubMed-specific fields to entry."""
    entry["PMID"] = identifier
    entry["PMCID"] = "N/A"
    entry["DOI"] = "N/A"
    entry["Journal"] = "N/A"

_fetch_pdf_url_with_fallbacks(pmcid)

Fetch PDF URL from OA API with comprehensive fallback strategies.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pmcid | str | PMC ID (e.g., 'PMC1234567') | required |

Returns:

| Type | Description |
| --- | --- |
| str | PDF URL string (empty if all strategies fail) |

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def _fetch_pdf_url_with_fallbacks(self, pmcid: str) -> str:
    """
    Fetch PDF URL from OA API with comprehensive fallback strategies.

    Args:
        pmcid: PMC ID (e.g., 'PMC1234567')

    Returns:
        PDF URL string (empty if all strategies fail)
    """
    logger.info("Fetching PDF URL for PMCID: %s", pmcid)

    # Strategy 1: Official OA API (fastest when it works)
    pdf_url = self._try_oa_api(pmcid)
    if pdf_url:
        return pdf_url

    # Strategy 2: Europe PMC Service (most reliable fallback)
    pdf_url = self._try_europe_pmc(pmcid)
    if pdf_url:
        return pdf_url

    # Strategy 3: Scrape PMC page for citation_pdf_url meta tag
    pdf_url = self._try_pmc_page_scraping(pmcid)
    if pdf_url:
        return pdf_url

    # Strategy 4: Direct PMC PDF URL pattern (least reliable)
    pdf_url = self._try_direct_pmc_url(pmcid)
    if pdf_url:
        return pdf_url

    logger.warning("All PDF URL strategies failed for PMCID: %s", pmcid)
    return ""

_get_paper_identifier_info(paper)

Get PubMed-specific identifier info for paper summary.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
321
322
323
324
325
326
327
328
329
330
def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
    """Get PubMed-specific identifier info for paper summary."""
    pmid = paper.get("PMID", "N/A")
    pmcid = paper.get("PMCID", "N/A")

    info = f" (PMID: {pmid})"
    if pmcid != "N/A":
        info += f"\n   PMCID: {pmcid}"

    return info

_try_direct_pmc_url(pmcid)

Try direct PMC PDF URL pattern.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
224
225
226
227
228
229
230
231
232
233
234
235
236
237
def _try_direct_pmc_url(self, pmcid: str) -> str:
    """Try direct PMC PDF URL pattern.

    Args:
        pmcid: PMC ID to build the URL from.

    Returns:
        The direct PDF URL when a HEAD request to it ends in HTTP 200,
        otherwise an empty string.
    """
    direct_pmc_url = f"{self.direct_pmc_pdf_base_url}/{pmcid}/pdf/"
    logger.info("Trying direct PMC PDF URL for %s: %s", pmcid, direct_pmc_url)

    try:
        # requests.head() does not follow redirects unless asked to, so a
        # redirecting PDF endpoint would otherwise never report 200.
        response = requests.head(
            direct_pmc_url, timeout=self.request_timeout, allow_redirects=True
        )
        if response.status_code == 200:
            logger.info("Direct PMC PDF URL works for %s", pmcid)
            return direct_pmc_url
    except requests.RequestException as e:
        logger.info("Direct PMC PDF URL failed for %s: %s", pmcid, str(e))

    return ""

_try_europe_pmc(pmcid)

Try Europe PMC service for PDF.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def _try_europe_pmc(self, pmcid: str) -> str:
    """Try Europe PMC service for PDF.

    Args:
        pmcid: PMC ID to look up.

    Returns:
        The Europe PMC render URL when a HEAD request to it ends in
        HTTP 200, otherwise an empty string.
    """
    europe_pmc_url = f"{self.europe_pmc_base_url}?accid={pmcid}&blobtype=pdf"
    logger.info("Trying Europe PMC service for %s: %s", pmcid, europe_pmc_url)

    try:
        # requests.head() does not follow redirects unless asked to, so a
        # redirecting endpoint would otherwise never report 200.
        response = requests.head(
            europe_pmc_url, timeout=self.request_timeout, allow_redirects=True
        )
        if response.status_code == 200:
            logger.info("Europe PMC service works for %s", pmcid)
            return europe_pmc_url
    except requests.RequestException as e:
        logger.info("Europe PMC service failed for %s: %s", pmcid, str(e))

    return ""

_try_oa_api(pmcid)

Try to get PDF URL from official OA API.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def _try_oa_api(self, pmcid: str) -> str:
    """Try to get PDF URL from official OA API.

    Args:
        pmcid: PMC ID to look up.

    Returns:
        The PDF URL advertised by the OA API (FTP links rewritten to HTTPS),
        or an empty string when the request fails, the response is not
        well-formed XML, the API reports an error, or no PDF link is present.
    """
    query_url = f"{self.oa_api_url}?id={pmcid}"
    logger.info("Trying OA API for PMCID %s: %s", pmcid, query_url)

    try:
        response = requests.get(query_url, timeout=self.request_timeout)
        response.raise_for_status()

        logger.info("OA API response for PMCID %s: %s", pmcid, response.text[:500])

        # Parse XML response
        root = ET.fromstring(response.text)
    except requests.RequestException as e:
        logger.info("OA API failed for %s: %s", pmcid, str(e))
        return ""
    except ET.ParseError as e:
        # A malformed payload must not abort the fallback chain; report
        # "no URL" so the caller moves on to the next strategy.
        logger.info("OA API returned unparseable XML for %s: %s", pmcid, str(e))
        return ""

    # Check for error first
    error_elem = root.find(".//error")
    if error_elem is not None:
        error_code = error_elem.get("code", self.default_error_code)
        error_text = error_elem.text or "unknown error"
        logger.info(
            "OA API error for PMCID %s: %s - %s", pmcid, error_code, error_text
        )
        return ""

    # Look for PDF link
    pdf_link = root.find(".//link[@format='pdf']")
    if pdf_link is None:
        return ""

    pdf_url = pdf_link.get("href", "")
    logger.info("Found PDF URL from OA API for PMCID %s: %s", pmcid, pdf_url)

    # Convert FTP links to HTTPS for download compatibility. Only the
    # leading base is swapped so later occurrences in the path survive.
    if pdf_url.startswith(self.ftp_base_url):
        pdf_url = self.https_base_url + pdf_url[len(self.ftp_base_url):]
        logger.info("Converted FTP to HTTPS for %s: %s", pmcid, pdf_url)

    return pdf_url

_try_pmc_page_scraping(pmcid)

Try scraping PMC page for PDF meta tag.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
def _try_pmc_page_scraping(self, pmcid: str) -> str:
    """Look up the PDF link advertised in the PMC article page's meta tags.

    Args:
        pmcid: PMC ID whose article page is fetched.

    Returns:
        The ``content`` of the meta tag named ``self.pdf_meta_name``, or an
        empty string when the request fails or the tag is missing or empty.
    """
    pmc_page_url = f"{self.pmc_page_base_url}/{pmcid}/"
    logger.info(
        "Scraping PMC page for PDF meta tag for %s: %s", pmcid, pmc_page_url
    )

    try:
        page = requests.get(
            pmc_page_url,
            headers={"User-Agent": self.user_agent},
            timeout=self.request_timeout,
        )
        page.raise_for_status()

        document = BeautifulSoup(page.content, "html.parser")
        meta_node = document.find("meta", attrs={"name": self.pdf_meta_name})
        if meta_node is None:
            return ""

        # The cast only informs the type checker that find() returned a Tag.
        link = cast(Tag, meta_node).get("content")
        if not link:
            return ""

        logger.info(
            "Found %s meta tag for %s: %s", self.pdf_meta_name, pmcid, link
        )
        return str(link)
    except requests.RequestException as exc:
        logger.info("PMC page scraping failed for %s: %s", pmcid, str(exc))

    return ""

construct_pdf_url(metadata, identifier)

Construct PDF URL using multiple fallback strategies.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| metadata | Dict[str, Any] | JSON response from ID converter | required |
| identifier | str | PMID | required |

Returns:

| Type | Description |
| --- | --- |
| str | PDF URL string (empty if no PDF available) |

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def construct_pdf_url(self, metadata: Dict[str, Any], identifier: str) -> str:
    """
    Resolve a PDF URL for the paper described by *metadata*.

    Args:
        metadata: JSON response from ID converter
        identifier: PMID

    Returns:
        PDF URL string; empty when the response has no records, the first
        record carries no usable PMCID, or no source yields a PDF.
    """
    has_records = "records" in metadata and metadata["records"]
    if not has_records:
        return ""

    pmcid = metadata["records"][0].get("pmcid", "")
    if pmcid and pmcid != "N/A":
        return self._fetch_pdf_url_with_fallbacks(pmcid)

    # Every PDF source is keyed by PMCID, so there is nothing to try.
    logger.info("No PMCID available for PDF fetch: PMID %s", identifier)
    return ""

extract_paper_metadata(metadata, identifier, pdf_result)

Extract structured metadata from PubMed ID converter response.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| metadata | Dict[str, Any] | JSON response from ID converter | required |
| identifier | str | PMID | required |
| pdf_result | Optional[Tuple[str, str]] | Tuple of (temp_file_path, filename) if PDF downloaded | required |

Returns:

| Type | Description |
| --- | --- |
| Dict[str, Any] | Standardized paper metadata dictionary |

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
def extract_paper_metadata(
    self,
    metadata: Dict[str, Any],
    identifier: str,
    pdf_result: Optional[Tuple[str, str]],
) -> Dict[str, Any]:
    """
    Extract structured metadata from PubMed ID converter response.

    Args:
        metadata: JSON response from ID converter
        identifier: PMID
        pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

    Returns:
        Standardized paper metadata dictionary

    Raises:
        RuntimeError: If the response contains no records
    """
    records = metadata.get("records")
    if not records:
        raise RuntimeError("No records found in metadata")

    record = records[0]  # Get first (and should be only) record

    # Extract basic fields from ID converter. Empty or null values are
    # treated like missing ones, matching construct_pdf_url().
    pmcid = record.get("pmcid") or "N/A"
    doi = record.get("doi") or "N/A"

    # Handle PDF download results
    if pdf_result:
        temp_file_path, filename = pdf_result
        access_type = "open_access_downloaded"
        pdf_url = temp_file_path  # Use local temp file path
    else:
        temp_file_path = ""
        filename = self.get_default_filename(identifier)
        access_type = "abstract_only" if pmcid != "N/A" else "no_pmcid"
        pdf_url = ""

    # Note: For PubMed, we don't get title/authors from ID converter
    # In a real implementation, you might want to call E-utilities for full metadata
    # For now, we'll use placeholders and focus on the ID conversion functionality

    return {
        "Title": (
            f"PubMed Article {identifier}"
        ),  # Placeholder - would need E-utilities for real title
        "Authors": [],  # Placeholder - would need E-utilities for real authors
        "Abstract": "Abstract available in PubMed",  # Placeholder
        "Publication Date": "N/A",  # Would need E-utilities for this
        "PMID": identifier,
        "PMCID": pmcid,
        "DOI": doi,
        "Journal": "N/A",  # Would need E-utilities for this
        "URL": pdf_url,
        "pdf_url": pdf_url,
        "access_type": access_type,
        "filename": filename,
        "source": "pubmed",
        "temp_file_path": temp_file_path,
    }

fetch_metadata(identifier)

Fetch paper metadata from PubMed ID Converter API.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| identifier | str | PMID (e.g., '12345678') | required |

Returns:

| Type | Description |
| --- | --- |
| Dict[str, Any] | JSON response from PMC ID Converter API |

Raises:

| Type | Description |
| --- | --- |
| RequestException | If API call fails |
| RuntimeError | If no records found in response |

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def fetch_metadata(self, identifier: str) -> Dict[str, Any]:
    """
    Query the ID Converter API for the record matching a PMID.

    Args:
        identifier: PMID (e.g., '12345678')

    Returns:
        The decoded JSON response, which has a non-empty ``records`` entry.

    Raises:
        requests.RequestException: If API call fails
        RuntimeError: If no records found in response
    """
    query_url = f"{self.id_converter_url}?ids={identifier}&format={self.id_converter_format}"
    logger.info(
        "Fetching metadata from ID converter for PMID %s: %s", identifier, query_url
    )

    reply = requests.get(query_url, timeout=self.request_timeout)
    reply.raise_for_status()
    payload = reply.json()
    logger.info("ID converter response for PMID %s: %s", identifier, payload)

    # Only a response that actually carries records is useful to callers.
    if "records" in payload and payload["records"]:
        return payload

    raise RuntimeError("No records found in PMC ID Converter API response")

get_default_filename(identifier)

Generate default filename for PubMed paper.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
307
308
309
def get_default_filename(self, identifier: str) -> str:
    """Return the fallback PDF filename derived from a PMID."""
    stem = f"pmid_{identifier}"
    return f"{stem}.pdf"

get_identifier_name()

Return identifier display name.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
303
304
305
def get_identifier_name(self) -> str:
    """Return the display label for this service's paper identifier."""
    identifier_label = "PMID"
    return identifier_label

get_service_name()

Return service name.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
299
300
301
def get_service_name(self) -> str:
    """Return the human-readable name of this download service."""
    service_label = "PubMed"
    return service_label

get_snippet(abstract)

Override to handle PubMed-specific abstract placeholder.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
311
312
313
314
315
316
317
318
319
def get_snippet(self, abstract: str) -> str:
    """Return a snippet for *abstract*, ignoring PubMed placeholder text.

    A missing abstract, ``"N/A"`` and the ``"Abstract available in PubMed"``
    placeholder all yield an empty string; any other text is passed to the
    base-class implementation.
    """
    placeholders = ("N/A", "Abstract available in PubMed")
    if abstract and abstract not in placeholders:
        return super().get_snippet(abstract)
    return ""