Skip to content

MedRxiv Downloader

MedRxiv paper downloader implementation.

MedrxivDownloader

Bases: BasePaperDownloader

MedRxiv-specific implementation of paper downloader.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
class MedrxivDownloader(BasePaperDownloader):
    """MedRxiv-specific implementation of paper downloader.

    Papers are addressed by DOI. The class fetches the API record for a DOI,
    builds the PDF URL from a configurable template, and converts the API
    record into a flat metadata dictionary.
    """

    def __init__(self, config: Any):
        """
        Initialize MedRxiv downloader with configuration.

        Args:
            config: Configuration object. ``config.api_url`` is required;
                ``pdf_url_template`` and ``default_version`` are optional and
                fall back to the defaults below.
        """
        super().__init__(config)
        self.api_url = config.api_url
        # The template is filled with ``identifier`` and ``version`` keywords.
        self.pdf_url_template = getattr(
            config,
            "pdf_url_template",
            "https://www.medrxiv.org/content/{identifier}v{version}.full.pdf",
        )
        # Version used when the API record carries no "version" key.
        self.default_version = getattr(config, "default_version", "1")

    def fetch_metadata(self, identifier: str) -> Dict[str, Any]:
        """
        Fetch paper metadata from medRxiv API.

        Args:
            identifier: DOI (e.g., '10.1101/2020.09.09.20191205')

        Returns:
            JSON response as dictionary from medRxiv API

        Raises:
            requests.RequestException: If API call fails
            RuntimeError: If no collection data found in response
        """
        query_url = f"{self.api_url}/medrxiv/{identifier}/na/json"
        logger.info("Fetching metadata for DOI %s from: %s", identifier, query_url)

        # NOTE(review): request_timeout is not assigned in this class;
        # presumably it is set by BasePaperDownloader.__init__ - confirm.
        response = requests.get(query_url, timeout=self.request_timeout)
        response.raise_for_status()

        paper_data = response.json()

        # A missing or empty "collection" is treated as "paper not found".
        if "collection" not in paper_data or not paper_data["collection"]:
            raise RuntimeError("No collection data found in medRxiv API response")

        return paper_data

    def construct_pdf_url(self, metadata: Dict[str, Any], identifier: str) -> str:
        """
        Construct PDF URL from medRxiv metadata and DOI.

        Args:
            metadata: JSON response from medRxiv API
            identifier: DOI

        Returns:
            Constructed PDF URL string, or an empty string when ``metadata``
            has no non-empty "collection" entry.
        """
        if "collection" not in metadata or not metadata["collection"]:
            return ""

        paper = metadata["collection"][0]  # Get first (and should be only) paper
        version = paper.get("version", self.default_version)

        # Construct medRxiv PDF URL using template
        pdf_url = self.pdf_url_template.format(identifier=identifier, version=version)
        logger.info("Constructed PDF URL for DOI %s: %s", identifier, pdf_url)

        return pdf_url

    def extract_paper_metadata(
        self,
        metadata: Dict[str, Any],
        identifier: str,
        pdf_result: Optional[Tuple[str, str]],
    ) -> Dict[str, Any]:
        """
        Extract structured metadata from medRxiv API response.

        Args:
            metadata: JSON response from medRxiv API
            identifier: DOI
            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

        Returns:
            Standardized paper metadata dictionary

        Raises:
            RuntimeError: If ``metadata`` has no non-empty "collection" entry
        """
        if "collection" not in metadata or not metadata["collection"]:
            raise RuntimeError("No collection data found in metadata")

        paper = metadata["collection"][0]  # Get first (and should be only) paper

        # Extract basic metadata
        basic_metadata = self._extract_basic_metadata(paper, identifier)

        # Handle PDF download results
        pdf_metadata = self._extract_pdf_metadata(pdf_result, identifier)

        # Combine all metadata; PDF keys win on any key collision.
        return {
            **basic_metadata,
            **pdf_metadata,
        }

    @staticmethod
    def _clean_text(value: Any, default: str = "N/A") -> str:
        """
        Return ``value`` as a stripped string, or ``default`` when it is None.

        ``dict.get(key, default)`` only applies its default to *missing* keys,
        so a key that is present with a null (or non-string) value must be
        handled here instead of calling ``.strip()`` on it directly.
        """
        if value is None:
            return default
        return str(value).strip()

    def _extract_basic_metadata(
        self, paper: Dict[str, Any], identifier: str
    ) -> Dict[str, Any]:
        """
        Extract basic metadata from paper data.

        Args:
            paper: Single paper record taken from the API "collection" list
            identifier: DOI, stored verbatim under the "DOI" key

        Returns:
            Dictionary with Title, Authors, Abstract, Publication Date, DOI,
            Category, Version, source and server keys. Text fields that are
            missing or null become "N/A".
        """
        # Extract basic fields (null-safe: see _clean_text)
        title = self._clean_text(paper.get("title"))
        abstract = self._clean_text(paper.get("abstract"))
        pub_date = self._clean_text(paper.get("date"))
        category = self._clean_text(paper.get("category"))
        version = paper.get("version", "N/A")

        # Extract authors - typically in a semicolon-separated string
        authors = self._extract_authors(paper.get("authors", ""))

        return {
            "Title": title,
            "Authors": authors,
            "Abstract": abstract,
            "Publication Date": pub_date,
            "DOI": identifier,
            "Category": category,
            "Version": version,
            "source": "medrxiv",
            "server": "medrxiv",
        }

    def _extract_authors(self, authors_str: str) -> list:
        """
        Extract and clean authors from semicolon-separated string.

        Returns an empty list for a falsy input; blank segments are dropped.
        """
        if not authors_str:
            return []
        return [author.strip() for author in authors_str.split(";") if author.strip()]

    def _extract_pdf_metadata(
        self, pdf_result: Optional[Tuple[str, str]], identifier: str
    ) -> Dict[str, Any]:
        """
        Extract PDF-related metadata.

        Args:
            pdf_result: Tuple of (temp_file_path, filename), or a falsy value
                when no PDF is available
            identifier: DOI, used only to build the fallback filename

        Returns:
            Dictionary with URL, pdf_url, filename, access_type and
            temp_file_path keys.
        """
        if pdf_result:
            temp_file_path, filename = pdf_result
            # The local temp path is stored under all three path/URL keys.
            return {
                "URL": temp_file_path,
                "pdf_url": temp_file_path,
                "filename": filename,
                "access_type": "open_access_downloaded",
                "temp_file_path": temp_file_path,
            }

        return {
            "URL": "",
            "pdf_url": "",
            "filename": self.get_default_filename(identifier),
            "access_type": "download_failed",
            "temp_file_path": "",
        }

    def get_service_name(self) -> str:
        """Return service name."""
        return "medRxiv"

    def get_identifier_name(self) -> str:
        """Return identifier display name."""
        return "DOI"

    def get_default_filename(self, identifier: str) -> str:
        """Generate default filename for medRxiv paper."""
        # Sanitize DOI for filename use: "/" and "." both become "_".
        return f"{identifier.replace('/', '_').replace('.', '_')}.pdf"

    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
        """
        Get medRxiv-specific identifier info for paper summary.

        Reads the "DOI", "Publication Date" and "Category" keys of an
        already-extracted metadata dictionary; the category line is appended
        only when a category other than "N/A" is present.
        """
        doi = paper.get("DOI", "N/A")
        pub_date = paper.get("Publication Date", "N/A")
        category = paper.get("Category", "N/A")

        info = f" (DOI:{doi}, {pub_date})"
        if category != "N/A":
            info += f"\n   Category: {category}"

        return info

    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
        """
        Add DOI and medRxiv-specific fields to entry.

        Mutates ``entry`` in place: sets "DOI" to ``identifier``, "Category"
        and "Version" to "N/A", and "server" to "medrxiv".
        """
        entry["DOI"] = identifier
        entry["Category"] = "N/A"
        entry["Version"] = "N/A"
        entry["server"] = "medrxiv"

__init__(config)

Initialize MedRxiv downloader with configuration.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
19
20
21
22
23
24
25
26
27
28
def __init__(self, config: Any):
    """
    Set up the downloader from ``config``.

    ``config.api_url`` must exist; ``pdf_url_template`` and
    ``default_version`` are read with ``getattr`` and fall back to the
    built-in defaults when the config does not define them.
    """
    super().__init__(config)

    fallback_template = (
        "https://www.medrxiv.org/content/{identifier}v{version}.full.pdf"
    )
    fallback_version = "1"

    self.api_url = config.api_url
    self.pdf_url_template = getattr(config, "pdf_url_template", fallback_template)
    self.default_version = getattr(config, "default_version", fallback_version)

_add_service_identifier(entry, identifier)

Add DOI and medRxiv-specific fields to entry.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
193
194
195
196
197
198
def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
    """Add DOI and medRxiv-specific fields to entry."""
    entry["DOI"] = identifier
    entry["Category"] = "N/A"
    entry["Version"] = "N/A"
    entry["server"] = "medrxiv"

_extract_authors(authors_str)

Extract and clean authors from semicolon-separated string.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
140
141
142
143
144
def _extract_authors(self, authors_str: str) -> list:
    """Extract and clean authors from semicolon-separated string."""
    if not authors_str:
        return []
    return [author.strip() for author in authors_str.split(";") if author.strip()]

_extract_basic_metadata(paper, identifier)

Extract basic metadata from paper data.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def _extract_basic_metadata(
    self, paper: Dict[str, Any], identifier: str
) -> Dict[str, Any]:
    """Extract basic metadata from paper data."""
    # Extract basic fields
    title = paper.get("title", "N/A").strip()
    abstract = paper.get("abstract", "N/A").strip()
    pub_date = paper.get("date", "N/A").strip()
    category = paper.get("category", "N/A").strip()
    version = paper.get("version", "N/A")

    # Extract authors - typically in a semicolon-separated string
    authors = self._extract_authors(paper.get("authors", ""))

    return {
        "Title": title,
        "Authors": authors,
        "Abstract": abstract,
        "Publication Date": pub_date,
        "DOI": identifier,
        "Category": category,
        "Version": version,
        "source": "medrxiv",
        "server": "medrxiv",
    }

_extract_pdf_metadata(pdf_result, identifier)

Extract PDF-related metadata.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def _extract_pdf_metadata(
    self, pdf_result: Optional[Tuple[str, str]], identifier: str
) -> Dict[str, Any]:
    """Extract PDF-related metadata."""
    if pdf_result:
        temp_file_path, filename = pdf_result
        return {
            "URL": temp_file_path,
            "pdf_url": temp_file_path,
            "filename": filename,
            "access_type": "open_access_downloaded",
            "temp_file_path": temp_file_path,
        }

    return {
        "URL": "",
        "pdf_url": "",
        "filename": self.get_default_filename(identifier),
        "access_type": "download_failed",
        "temp_file_path": "",
    }

_get_paper_identifier_info(paper)

Get medRxiv-specific identifier info for paper summary.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
181
182
183
184
185
186
187
188
189
190
191
def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
    """Get medRxiv-specific identifier info for paper summary."""
    doi = paper.get("DOI", "N/A")
    pub_date = paper.get("Publication Date", "N/A")
    category = paper.get("Category", "N/A")

    info = f" (DOI:{doi}, {pub_date})"
    if category != "N/A":
        info += f"\n   Category: {category}"

    return info

construct_pdf_url(metadata, identifier)

Construct PDF URL from medRxiv metadata and DOI.

Parameters:

Name Type Description Default
metadata Dict[str, Any]

JSON response from medRxiv API

required
identifier str

DOI

required

Returns:

Type Description
str

Constructed PDF URL string

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def construct_pdf_url(self, metadata: Dict[str, Any], identifier: str) -> str:
    """
    Build the PDF URL for a DOI from the medRxiv API response.

    Args:
        metadata: JSON response from medRxiv API
        identifier: DOI

    Returns:
        The URL produced by filling the configured template with the DOI and
        the first record's version (or the default version when the record
        has no "version" key). An empty string is returned when ``metadata``
        has no non-empty "collection" entry.
    """
    if not ("collection" in metadata and metadata["collection"]):
        return ""

    # Only the first record of the collection is consulted.
    record = metadata["collection"][0]
    paper_version = record.get("version", self.default_version)

    url = self.pdf_url_template.format(version=paper_version, identifier=identifier)
    logger.info("Constructed PDF URL for DOI %s: %s", identifier, url)
    return url

extract_paper_metadata(metadata, identifier, pdf_result)

Extract structured metadata from medRxiv API response.

Parameters:

Name Type Description Default
metadata Dict[str, Any]

JSON response from medRxiv API

required
identifier str

DOI

required
pdf_result Optional[Tuple[str, str]]

Tuple of (temp_file_path, filename) if PDF downloaded

required

Returns:

Type Description
Dict[str, Any]

Standardized paper metadata dictionary

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def extract_paper_metadata(
    self,
    metadata: Dict[str, Any],
    identifier: str,
    pdf_result: Optional[Tuple[str, str]],
) -> Dict[str, Any]:
    """
    Turn a medRxiv API response into the standardized metadata dictionary.

    Args:
        metadata: JSON response from medRxiv API
        identifier: DOI
        pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

    Returns:
        Basic paper fields merged with the PDF fields; on a key collision the
        PDF field wins.

    Raises:
        RuntimeError: If ``metadata`` has no non-empty "collection" entry
    """
    if not ("collection" in metadata and metadata["collection"]):
        raise RuntimeError("No collection data found in metadata")

    # Only the first record of the collection is used.
    record = metadata["collection"][0]

    combined = dict(self._extract_basic_metadata(record, identifier))
    combined.update(self._extract_pdf_metadata(pdf_result, identifier))
    return combined

fetch_metadata(identifier)

Fetch paper metadata from medRxiv API.

Parameters:

Name Type Description Default
identifier str

DOI (e.g., '10.1101/2020.09.09.20191205')

required

Returns:

Type Description
Dict[str, Any]

JSON response as dictionary from medRxiv API

Raises:

Type Description
RequestException

If API call fails

RuntimeError

If no collection data found in response

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def fetch_metadata(self, identifier: str) -> Dict[str, Any]:
    """
    Query the medRxiv API for the record belonging to a DOI.

    Args:
        identifier: DOI (e.g., '10.1101/2020.09.09.20191205')

    Returns:
        The decoded JSON response, which contains a non-empty "collection"

    Raises:
        requests.RequestException: If API call fails
        RuntimeError: If no collection data found in response
    """
    endpoint = f"{self.api_url}/medrxiv/{identifier}/na/json"
    logger.info("Fetching metadata for DOI %s from: %s", identifier, endpoint)

    reply = requests.get(endpoint, timeout=self.request_timeout)
    reply.raise_for_status()  # surface HTTP error statuses as exceptions
    payload = reply.json()

    # A missing or empty "collection" means the API returned no paper.
    if "collection" in payload and payload["collection"]:
        return payload

    raise RuntimeError("No collection data found in medRxiv API response")

get_default_filename(identifier)

Generate default filename for medRxiv paper.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
176
177
178
179
def get_default_filename(self, identifier: str) -> str:
    """Return a PDF filename derived from the DOI, with "/" and "." replaced by "_"."""
    # One translation pass maps both separator characters to underscores.
    safe_stem = identifier.translate(str.maketrans("/.", "__"))
    return safe_stem + ".pdf"

get_identifier_name()

Return identifier display name.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
172
173
174
def get_identifier_name(self) -> str:
    """Return the display name of the identifier type this service uses."""
    identifier_label = "DOI"
    return identifier_label

get_service_name()

Return service name.

Source code in aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
168
169
170
def get_service_name(self) -> str:
    """Return the display name of the service handled by this downloader."""
    service_label = "medRxiv"
    return service_label