Skip to content

PrimeKG

Class for loading PrimeKG dataset.

PrimeKG

Bases: Dataset

Class for loading the PrimeKG dataset. It downloads the data from the Harvard Dataverse and stores it in a local directory. The data is then loaded into two pandas DataFrames, one for nodes and one for edges.

Source code in aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
class PrimeKG(Dataset):
    """
    Class for loading PrimeKG dataset.
    It downloads the data from the Harvard Dataverse and stores it in the local directory.
    The data is then loaded into pandas DataFrame of nodes and edges.

    Constructing an instance only prepares the local directory. Nothing is
    downloaded or read until `load_data()` is called, so `nodes` and `edges`
    remain `None` until then.
    """

    def __init__(self, local_dir: str = "../../../data/primekg/"):
        """
        Constructor for PrimeKG class.

        Args:
            local_dir (str): The local directory where the data will be stored.
        """
        self.name: str = "primekg"
        # Base URL of the file-access endpoint; a file id is appended to it.
        self.server_path: str = "https://dataverse.harvard.edu/api/access/datafile/"
        # File ids appended to `server_path` to fetch the raw node/edge files.
        self.file_ids: dict = {"nodes": 6180617, "edges": 6180616}
        self.local_dir: str = local_dir

        # Attributes to store the data (populated by `load_data`)
        self.nodes: pd.DataFrame = None
        self.edges: pd.DataFrame = None

        # Set up the dataset
        self.setup()

    def setup(self):
        """
        A method to set up the dataset.

        Creates `local_dir` (and any missing parents) if it does not exist yet.
        """
        # Create the directory itself rather than `os.path.dirname(local_dir)`:
        # dirname() strips the last path component when there is no trailing
        # slash, so "data/primekg" would only create "data" and a bare
        # "primekg" would try to create "". An empty `local_dir` is treated as
        # the current directory, matching how os.path.join() resolves it later.
        os.makedirs(self.local_dir or ".", exist_ok=True)

    def _download_file(self, remote_url: str, local_path: str):
        """
        A helper function to download a file from remote URL to the local directory.

        The body is streamed to disk in 1024-byte chunks while a tqdm progress
        bar is updated. The bar's total is taken from the `content-length`
        header, or 0 when the header is absent.

        Args:
            remote_url (str): The remote URL of the file to be downloaded.
            local_path (str): The local path where the file will be saved.

        Raises:
            requests.HTTPError: Raised by `raise_for_status()` when the server
                answers with an error status.
        """
        # Context managers guarantee the streamed connection, the progress bar
        # and the output file are all released even if the transfer fails.
        with requests.get(remote_url, stream=True, timeout=300) as response:
            response.raise_for_status()
            total_bytes = int(response.headers.get("content-length", 0))
            with tqdm(total=total_bytes, unit="iB", unit_scale=True) as progress_bar, \
                    open(local_path, "wb") as file:
                for data in response.iter_content(1024):
                    progress_bar.update(len(data))
                    file.write(data)

    def _load_nodes(self) -> pd.DataFrame:
        """
        Private method to load the nodes dataframe of PrimeKG dataset.
        This method downloads the nodes file from the Harvard Dataverse if it does not exist
        in the local directory. Otherwise, it loads the data from the local directory.
        It further processes the dataframe of nodes and returns it.

        The processed table is cached as `<name>_nodes.tsv.gz` inside
        `local_dir`; the raw download is written to `nodes.tab` in the same
        directory and is not removed afterwards.

        Returns:
            The nodes dataframe of PrimeKG dataset.
        """
        local_file = os.path.join(self.local_dir, f"{self.name}_nodes.tsv.gz")
        if os.path.exists(local_file):
            print(f"{local_file} already exists. Loading the data from the local directory.")

            # Load the dataframe from the local directory and assign it to the nodes attribute
            nodes = pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
        else:
            print(f"Downloading node file from {self.server_path}{self.file_ids['nodes']}")

            # Download the file from the Harvard Dataverse with designated file_id for node
            raw_file = os.path.join(self.local_dir, "nodes.tab")
            self._download_file(f"{self.server_path}{self.file_ids['nodes']}", raw_file)

            # Load the downloaded file into a pandas DataFrame
            nodes = pd.read_csv(raw_file, sep="\t", low_memory=False)

            # Keep only the columns used downstream, in a fixed order
            nodes = nodes[
                ["node_index", "node_name", "node_source", "node_id", "node_type"]
            ]

            # Store compressed dataframe in the local directory
            nodes.to_csv(local_file, index=False, sep="\t", compression="gzip")

        return nodes

    def _load_edges(self, nodes: pd.DataFrame) -> pd.DataFrame:
        """
        Private method to load the edges dataframe of PrimeKG dataset.
        This method downloads the edges file from the Harvard Dataverse if it does not exist
        in the local directory. Otherwise, it loads the data from the local directory.
        It further processes the dataframe of edges and returns it.

        On the download path each edge's `x_index` / `y_index` is joined
        against `nodes["node_index"]` so that the head and tail node attributes
        become `head_*` / `tail_*` columns. The processed table is cached as
        `<name>_edges.tsv.gz`; the raw download stays in `edges.csv`.

        Args:
            nodes (pd.DataFrame): The nodes dataframe of PrimeKG dataset.
                Only used when the edges have to be built from the raw file.

        Returns:
            The edges dataframe of PrimeKG dataset.
        """
        local_file = os.path.join(self.local_dir, f"{self.name}_edges.tsv.gz")
        if os.path.exists(local_file):
            print(f"{local_file} already exists. Loading the data from the local directory.")

            # Load the dataframe from the local directory and assign it to the edges attribute
            edges = pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
        else:
            print(f"Downloading edge file from {self.server_path}{self.file_ids['edges']}")

            # Download the file from the Harvard Dataverse with designated file_id for edge
            raw_file = os.path.join(self.local_dir, "edges.csv")
            self._download_file(f"{self.server_path}{self.file_ids['edges']}", raw_file)

            # Load the downloaded file into a pandas DataFrame
            edges = pd.read_csv(raw_file, sep=",", low_memory=False)

            # Attach the head node attributes (default inner join on x_index)
            edges = edges.merge(
                nodes, left_on="x_index", right_on="node_index"
            )
            edges.drop(["x_index"], axis=1, inplace=True)
            edges.rename(
                columns={
                    "node_index": "head_index",
                    "node_name": "head_name",
                    "node_source": "head_source",
                    "node_id": "head_id",
                    "node_type": "head_type",
                },
                inplace=True,
            )
            # Attach the tail node attributes (default inner join on y_index)
            edges = edges.merge(
                nodes, left_on="y_index", right_on="node_index"
            )
            edges.drop(["y_index"], axis=1, inplace=True)
            edges.rename(
                columns={
                    "node_index": "tail_index",
                    "node_name": "tail_name",
                    "node_source": "tail_source",
                    "node_id": "tail_id",
                    "node_type": "tail_type"
                },
                inplace=True,
            )
            # Fix the column order of the final table
            edges = edges[
                [
                    "head_index", "head_name", "head_source", "head_id", "head_type",
                    "tail_index", "tail_name", "tail_source", "tail_id", "tail_type",
                    "display_relation", "relation",
                ]
            ]

            # Store compressed dataframe in the local directory
            edges.to_csv(local_file, index=False, sep="\t", compression="gzip")

        return edges

    def load_data(self):
        """
        Load the PrimeKG dataset into pandas DataFrame of nodes and edges.

        Nodes are loaded first because building the edges table from the raw
        file needs the nodes dataframe.
        """
        print("Loading nodes of PrimeKG dataset ...")
        self.nodes = self._load_nodes()

        print("Loading edges of PrimeKG dataset ...")
        self.edges = self._load_edges(self.nodes)

    def get_nodes(self) -> pd.DataFrame:
        """
        Get the nodes dataframe of PrimeKG dataset.

        Returns:
            The nodes dataframe of PrimeKG dataset (`None` until `load_data`
            has been called).
        """
        return self.nodes

    def get_edges(self) -> pd.DataFrame:
        """
        Get the edges dataframe of PrimeKG dataset.

        Returns:
            The edges dataframe of PrimeKG dataset (`None` until `load_data`
            has been called).
        """
        return self.edges

__init__(local_dir='../../../data/primekg/')

Constructor for PrimeKG class.

Parameters:

Name Type Description Default
local_dir str

The local directory where the data will be stored.

'../../../data/primekg/'
Source code in aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def __init__(self, local_dir: str = "../../../data/primekg/"):
    """
    Initialise a PrimeKG dataset handle.

    Records where the files live locally and where they are fetched from,
    leaves the node/edge tables empty, and finally runs `setup()`.

    Args:
        local_dir (str): The local directory where the data will be stored.
    """
    # Local storage location chosen by the caller
    self.local_dir: str = local_dir

    # Remote source: base URL plus the per-table file ids appended to it
    self.server_path: str = "https://dataverse.harvard.edu/api/access/datafile/"
    self.file_ids: dict = {"nodes": 6180617, "edges": 6180616}

    # Dataset name
    self.name: str = "primekg"

    # Tables are not loaded here; both start out as None
    self.edges: pd.DataFrame = None
    self.nodes: pd.DataFrame = None

    # Prepare the dataset once every attribute is in place
    self.setup()

_download_file(remote_url, local_path)

A helper function to download a file from remote URL to the local directory.

Parameters:

Name Type Description Default
remote_url str

The remote URL of the file to be downloaded.

required
local_path str

The local path where the file will be saved.

required
Source code in aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def _download_file(self, remote_url:str, local_path: str):
    """
    A helper function to download a file from remote URL to the local directory.

    The body is streamed to disk in 1024-byte chunks while a tqdm progress bar
    is updated. The bar's total is taken from the `content-length` header, or
    0 when the header is absent.

    Args:
        remote_url (str): The remote URL of the file to be downloaded.
        local_path (str): The local path where the file will be saved.

    Raises:
        requests.HTTPError: Raised by `raise_for_status()` when the server
            answers with an error status.
    """
    # Context managers guarantee the streamed connection, the progress bar and
    # the output file are all released even if the transfer fails part-way.
    with requests.get(remote_url, stream=True, timeout=300) as response:
        response.raise_for_status()
        total_bytes = int(response.headers.get("content-length", 0))
        with tqdm(total=total_bytes, unit="iB", unit_scale=True) as progress_bar, \
                open(local_path, "wb") as file:
            for data in response.iter_content(1024):
                progress_bar.update(len(data))
                file.write(data)

_load_edges(nodes)

Private method to load the edges dataframe of PrimeKG dataset. This method downloads the edges file from the Harvard Dataverse if it does not exist in the local directory. Otherwise, it loads the data from the local directory. It further processes the dataframe of edges and returns it.

Parameters:

Name Type Description Default
nodes DataFrame

The nodes dataframe of PrimeKG dataset.

required

Returns:

Type Description
DataFrame

The edges dataframe of PrimeKG dataset.

Source code in aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
def _load_edges(self, nodes: pd.DataFrame) -> pd.DataFrame:
    """
    Return the PrimeKG edges table, building and caching it when necessary.

    If the processed file `<name>_edges.tsv.gz` is already present in
    `local_dir`, it is read back as-is. Otherwise the raw edge file is
    downloaded to `edges.csv`, each endpoint index is joined against the
    nodes table to obtain `head_*` / `tail_*` columns, and the result is
    written to the cache file before being returned.

    Args:
        nodes (pd.DataFrame): The nodes dataframe of PrimeKG dataset.

    Returns:
        The edges dataframe of PrimeKG dataset.
    """
    local_file = os.path.join(self.local_dir, f"{self.name}_edges.tsv.gz")

    # Cached copy available: read it and stop here
    if os.path.exists(local_file):
        print(f"{local_file} already exists. Loading the data from the local directory.")
        return pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)

    print(f"Downloading edge file from {self.server_path}{self.file_ids['edges']}")

    # Fetch the raw edge list from the Harvard Dataverse and parse it
    raw_path = os.path.join(self.local_dir, "edges.csv")
    remote_url = f"{self.server_path}{self.file_ids['edges']}"
    self._download_file(remote_url, raw_path)
    table = pd.read_csv(raw_path, sep=",", low_memory=False)

    # Resolve the x endpoint first, then the y endpoint: join on the index
    # column, discard it, and re-label the node columns for that side.
    node_fields = ("index", "name", "source", "id", "type")
    for side, key_column in (("head", "x_index"), ("tail", "y_index")):
        relabel = {f"node_{field}": f"{side}_{field}" for field in node_fields}
        table = table.merge(nodes, left_on=key_column, right_on="node_index")
        table = table.drop(columns=[key_column])
        table = table.rename(columns=relabel)

    # Final column order: head attributes, tail attributes, relation labels
    ordered_columns = (
        [f"head_{field}" for field in node_fields]
        + [f"tail_{field}" for field in node_fields]
        + ["display_relation", "relation"]
    )
    table = table[ordered_columns]

    # Store compressed dataframe in the local directory
    table.to_csv(local_file, index=False, sep="\t", compression="gzip")
    return table

_load_nodes()

Private method to load the nodes dataframe of PrimeKG dataset. This method downloads the nodes file from the Harvard Dataverse if it does not exist in the local directory. Otherwise, it loads the data from the local directory. It further processes the dataframe of nodes and returns it.

Returns:

Type Description
DataFrame

The nodes dataframe of PrimeKG dataset.

Source code in aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def _load_nodes(self) -> pd.DataFrame:
    """
    Return the PrimeKG nodes table, building and caching it when necessary.

    If the processed file `<name>_nodes.tsv.gz` is already present in
    `local_dir`, it is read back as-is. Otherwise the raw node file is
    downloaded to `nodes.tab`, reduced to the five node columns, and written
    to the cache file before being returned.

    Returns:
        The nodes dataframe of PrimeKG dataset.
    """
    local_file = os.path.join(self.local_dir, f"{self.name}_nodes.tsv.gz")

    # Cached copy available: read it and stop here
    if os.path.exists(local_file):
        print(f"{local_file} already exists. Loading the data from the local directory.")
        return pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)

    print(f"Downloading node file from {self.server_path}{self.file_ids['nodes']}")

    # Fetch the raw node list from the Harvard Dataverse and parse it
    raw_path = os.path.join(self.local_dir, "nodes.tab")
    remote_url = f"{self.server_path}{self.file_ids['nodes']}"
    self._download_file(remote_url, raw_path)
    raw_table = pd.read_csv(raw_path, sep="\t", low_memory=False)

    # Keep only the node columns, in this order
    kept_columns = ["node_index", "node_name", "node_source", "node_id", "node_type"]
    table = raw_table[kept_columns]

    # Store compressed dataframe in the local directory
    table.to_csv(local_file, index=False, sep="\t", compression="gzip")
    return table

get_edges()

Get the edges dataframe of PrimeKG dataset.

Returns:

Type Description
DataFrame

The edges dataframe of PrimeKG dataset.

Source code in aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
194
195
196
197
198
199
200
201
def get_edges(self) -> pd.DataFrame:
    """
    Return the edges table currently held by this dataset object.

    Returns:
        The value of the `edges` attribute, returned unchanged.
    """
    edges_table = self.edges
    return edges_table

get_nodes()

Get the nodes dataframe of PrimeKG dataset.

Returns:

Type Description
DataFrame

The nodes dataframe of PrimeKG dataset.

Source code in aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
185
186
187
188
189
190
191
192
def get_nodes(self) -> pd.DataFrame:
    """
    Return the nodes table currently held by this dataset object.

    Returns:
        The value of the `nodes` attribute, returned unchanged.
    """
    nodes_table = self.nodes
    return nodes_table

load_data()

Load the PrimeKG dataset into two pandas DataFrames, one for nodes and one for edges.

Source code in aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
175
176
177
178
179
180
181
182
183
def load_data(self):
    """
    Populate the `nodes` and `edges` attributes of this dataset.

    The nodes table is loaded and stored first; it is then handed to the
    edge loader, whose result is stored as the edges table.
    """
    print("Loading nodes of PrimeKG dataset ...")
    loaded_nodes = self._load_nodes()
    self.nodes = loaded_nodes

    print("Loading edges of PrimeKG dataset ...")
    loaded_edges = self._load_edges(self.nodes)
    self.edges = loaded_edges

setup()

A method to set up the dataset.

Source code in aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
37
38
39
40
41
42
def setup(self):
    """
    A method to set up the dataset.
    """
    # Make the directory if it doesn't exist
    os.makedirs(os.path.dirname(self.local_dir), exist_ok=True)