Skip to content

Paper Loader

Paper loading utilities for managing PDF documents in vector store.

load_all_papers(vector_store, articles, call_id, config, has_gpu)

Ensure all papers in `articles` are loaded into the Milvus vector store. Loading parameters are tuned for GPU or CPU processing depending on `has_gpu`.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `vector_store` | `Any` | The Vectorstore instance | required |
| `articles` | `Dict[str, Any]` | Dictionary of article data | required |
| `call_id` | `str` | Call identifier for logging | required |
| `config` | `Any` | Configuration object | required |
| `has_gpu` | `bool` | Whether GPU is available | required |
Source code in aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def load_all_papers(
    vector_store: Any,  # The Vectorstore instance
    articles: Dict[str, Any],
    call_id: str,
    config: Any,
    has_gpu: bool,
) -> None:
    """
    Load every not-yet-loaded paper that has a PDF URL into the vector store.

    Each entry of ``articles`` is sorted into one of three groups: already
    present in ``vector_store.loaded_papers``, missing a ``pdf_url``, or
    pending. Pending papers are handed to ``add_papers_batch`` in a single
    call, with worker count and batch size chosen from ``has_gpu``.

    Note:
        ``has_gpu`` and ``config`` drive the parameter choice and the log
        output here, while the batch call itself receives
        ``vector_store.has_gpu`` and ``vector_store.config``.

    Args:
        vector_store: The Vectorstore instance
        articles: Dictionary of article data, keyed by paper id
        call_id: Call identifier for logging
        config: Configuration object (read via ``config.get``)
        has_gpu: Whether GPU is available
    """
    pending = []
    missing_pdf = []
    present = []

    # Triage: each paper ends up in exactly one of the three lists.
    for paper_id, info in articles.items():
        if paper_id in vector_store.loaded_papers:
            present.append(paper_id)
            continue
        url = info.get("pdf_url")
        if not url:
            missing_pdf.append(paper_id)
            continue
        # Tuple layout expected by the batch loader: (id, url, article data).
        pending.append((paper_id, url, info))

    # Status summary, tagged with the hardware mode.
    hardware_info = f" (GPU acceleration: {'enabled' if has_gpu else 'disabled'})"
    logger.info(
        "%s: Paper loading summary%s - Total: %d, Already loaded: %d, To load: %d, No PDF: %d",
        call_id,
        hardware_info,
        len(articles),
        len(present),
        len(pending),
        len(missing_pdf),
    )

    if missing_pdf:
        # Only the first five ids are printed to keep the log line short.
        ellipsis = "..." if len(missing_pdf) > 5 else ""
        logger.warning(
            "%s: Skipping %d papers without PDF URLs: %s%s",
            call_id,
            len(missing_pdf),
            missing_pdf[:5],
            ellipsis,
        )

    if not pending:
        logger.info("%s: All papers with PDFs are already loaded in Milvus", call_id)
        return

    # Pick the hardware profile: worker bounds, default batch size, log line.
    if has_gpu:
        worker_cap, worker_floor, default_batch = 12, 4, 2000
        params_msg = "%s: Using GPU-optimized loading parameters: %d workers, batch size %d"
    else:
        worker_cap, worker_floor, default_batch = 8, 3, 1000
        params_msg = "%s: Using CPU-optimized loading parameters: %d workers, batch size %d"

    # Worker count follows the number of pending papers, clamped to the profile.
    max_workers = min(worker_cap, max(worker_floor, len(pending)))
    # A configured "embedding_batch_size" overrides the profile default.
    batch_size = config.get("embedding_batch_size", default_batch)
    logger.info(params_msg, call_id, max_workers, batch_size)

    mode_label = "GPU accelerated" if has_gpu else "CPU processing"
    logger.info(
        "%s: Loading %d papers in ONE BATCH using %d parallel workers (batch size: %d, %s)",
        call_id,
        len(pending),
        max_workers,
        batch_size,
        mode_label,
    )

    # Single call covering every pending paper; state containers come from the
    # vector store wrapper itself.
    add_papers_batch(
        papers_to_add=pending,
        vector_store=vector_store.vector_store,  # the underlying LangChain store
        loaded_papers=vector_store.loaded_papers,
        paper_metadata=vector_store.paper_metadata,
        documents=vector_store.documents,
        config=vector_store.config,
        metadata_fields=vector_store.metadata_fields,
        has_gpu=vector_store.has_gpu,
        max_workers=max_workers,
        batch_size=batch_size,
    )

    completion_label = "GPU acceleration" if has_gpu else "CPU processing"
    logger.info(
        "%s: Successfully completed batch loading of all %d papers with %s",
        call_id,
        len(pending),
        completion_label,
    )