Skip to content

Zotero PDF Downloader

Utility functions for downloading PDFs from Zotero.

download_pdfs_in_parallel(session, user_id, api_key, attachment_item_map, **kwargs)

Download multiple PDFs in parallel using ThreadPoolExecutor.

Parameters:

Name Type Description Default
session Session

requests.Session for HTTP requests.

required
user_id str

Zotero user ID.

required
api_key str

Zotero API key.

required
attachment_item_map Dict[str, str]

Mapping of attachment_key to parent item_key.

required
kwargs

max_workers (int, optional): Maximum number of worker threads (default: min(10, n), where n is the number of entries in attachment_item_map). chunk_size (int, optional): Chunk size for streaming; passed through to download_zotero_pdf.

{}

Returns:

Type Description
Dict[str, Tuple[str, str, str]]

Mapping of parent item_key to (local_file_path, filename, attachment_key).

Source code in aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def download_pdfs_in_parallel(
    session: requests.Session,
    user_id: str,
    api_key: str,
    attachment_item_map: Dict[str, str],
    **kwargs,
) -> Dict[str, Tuple[str, str, str]]:
    """
    Download multiple PDFs in parallel using ThreadPoolExecutor.

    Each attachment is fetched with ``download_zotero_pdf`` on a worker thread.
    Attachments whose download returns ``None`` or raises a request/OS error
    are logged and left out of the result. If several attachments share the
    same parent item key, the one that completes last wins, because the result
    is keyed by parent item key.

    Args:
        session: requests.Session for HTTP requests.
        user_id: Zotero user ID.
        api_key: Zotero API key.
        attachment_item_map: Mapping of attachment_key to parent item_key.
        kwargs:
            max_workers (int, optional): Maximum number of worker threads
                (default: min(10, n), n being the number of attachments).
            chunk_size (int, optional): Chunk size for streaming.
            timeout (int, optional): Per-request timeout in seconds. Only
                forwarded to ``download_zotero_pdf`` when given and not None;
                otherwise that function's own default applies.

    Returns:
        Mapping of parent item_key to (local_file_path, filename, attachment_key).
    """
    # Extract optional parameters
    max_workers = kwargs.get("max_workers")
    chunk_size = kwargs.get("chunk_size")
    timeout = kwargs.get("timeout")
    # Log configured parameters for verification
    logger.info(
        "download_pdfs_in_parallel params -> max_workers=%s, chunk_size=%s",
        max_workers,
        chunk_size,
    )
    results: Dict[str, Tuple[str, str, str]] = {}
    if not attachment_item_map:
        # Nothing to do; also avoids creating an executor with zero workers.
        return results

    if max_workers is None:
        max_workers = min(10, len(attachment_item_map))

    # Keyword arguments handed to every single download. ``timeout`` is added
    # only when the caller supplied one, so the callee's default stays in
    # effect otherwise and existing call sites behave exactly as before.
    download_kwargs = {"chunk_size": chunk_size}
    if timeout is not None:
        download_kwargs["timeout"] = timeout

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which (attachment, parent item) pair each future belongs to.
        future_to_keys = {
            executor.submit(
                download_zotero_pdf,
                session,
                user_id,
                api_key,
                attachment_key,
                **download_kwargs,
            ): (attachment_key, item_key)
            for attachment_key, item_key in attachment_item_map.items()
        }

        for future in concurrent.futures.as_completed(future_to_keys):
            attachment_key, item_key = future_to_keys[future]
            try:
                res = future.result()
            except (requests.exceptions.RequestException, OSError) as e:
                logger.error("Failed to download PDF for key %s: %s", attachment_key, e)
                continue
            if res:
                # res is (local_file_path, filename); append the attachment key.
                results[item_key] = (*res, attachment_key)

    return results

download_zotero_pdf(session, user_id, api_key, attachment_key, **kwargs)

Download a PDF from Zotero by attachment key.

Parameters:

Name Type Description Default
session Session

requests.Session for HTTP requests.

required
user_id str

Zotero user ID.

required
api_key str

Zotero API key.

required
attachment_key str

Zotero attachment item key.

required
kwargs

timeout (int, optional): Request timeout in seconds (default: 10). chunk_size (int, optional): Chunk size in bytes used when streaming the response body (default: None).

{}

Returns:

Type Description
Optional[Tuple[str, str]]

Tuple of (local_file_path, filename) if successful, else None.

Source code in aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def download_zotero_pdf(
    session: requests.Session,
    user_id: str,
    api_key: str,
    attachment_key: str,
    **kwargs,
) -> Optional[Tuple[str, str]]:
    """
    Download a PDF from Zotero by attachment key.

    The response body is streamed into a temporary ``.pdf`` file that is NOT
    deleted automatically; on success the caller owns the file at the returned
    path. If the download fails after the temporary file was created, the
    partial file is removed before returning.

    Args:
        session: requests.Session for HTTP requests.
        user_id: Zotero user ID.
        api_key: Zotero API key.
        attachment_key: Zotero attachment item key.
        kwargs:
            timeout (int): Request timeout in seconds (default: 10).
            chunk_size (int, optional): Chunk size for streaming.

    Returns:
        Tuple of (local_file_path, filename) if successful, else None.
        ``filename`` comes from the Content-Disposition header when it carries
        a non-empty ``filename=`` parameter, otherwise it is "downloaded.pdf".
    """
    # Local import: the module's import block is not part of this definition.
    import os

    # Extract optional parameters
    timeout = kwargs.get("timeout", 10)
    chunk_size = kwargs.get("chunk_size")
    # Log configured parameters for verification
    logger.info("download_zotero_pdf params -> timeout=%s, chunk_size=%s", timeout, chunk_size)
    # Log download start
    logger.info(
        "Downloading Zotero PDF for attachment %s from Zotero API", attachment_key
    )
    zotero_pdf_url = (
        f"https://api.zotero.org/users/{user_id}/items/{attachment_key}/file"
    )
    headers = {"Zotero-API-Key": api_key}

    # Set as soon as the temp file exists so a failed download can remove it.
    temp_file_path = None
    try:
        response = session.get(
            zotero_pdf_url, headers=headers, stream=True, timeout=timeout
        )
        try:
            response.raise_for_status()

            # Download to a temporary file first
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                temp_file_path = temp_file.name
                for chunk in response.iter_content(chunk_size=chunk_size):
                    temp_file.write(chunk)
            content_disposition = response.headers.get("Content-Disposition", "")
        finally:
            # With stream=True the connection is held until the body is fully
            # consumed or the response is closed; close explicitly so an
            # interrupted download does not keep it checked out.
            response.close()
    except (requests.exceptions.RequestException, OSError) as e:
        if temp_file_path is not None:
            try:
                os.remove(temp_file_path)
            except OSError:
                # Best-effort cleanup; the download failure is what gets reported.
                pass
        logger.error(
            "Failed to download Zotero PDF for attachment %s: %s", attachment_key, e
        )
        return None

    logger.info("Zotero PDF downloaded to temporary file: %s", temp_file_path)

    # Determine filename from Content-Disposition header or default
    filename = ""
    if "filename=" in content_disposition:
        raw_value = content_disposition.split("filename=")[-1].strip()
        if raw_value.startswith('"'):
            # Quoted value: take the text up to the closing quote so trailing
            # parameters (e.g. '; size=123') are not glued onto the name.
            closing_quote = raw_value.find('"', 1)
            filename = raw_value[1:closing_quote] if closing_quote != -1 else raw_value[1:]
        else:
            # Unquoted token: it ends at the next parameter separator.
            filename = raw_value.split(";", 1)[0].strip()
    if not filename:
        filename = "downloaded.pdf"

    return temp_file_path, filename