# Source code for graphscope.framework.loader

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2020 Alibaba Group Holding Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json
import logging
import pathlib
from typing import Dict
from typing import Sequence
from typing import Tuple
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import pyarrow as pa

from graphscope.framework import utils
from graphscope.framework.errors import check_argument
from graphscope.proto import attr_value_pb2
from graphscope.proto import types_pb2

try:
    import vineyard
except (ImportError, TypeError):
    vineyard = None

logger = logging.getLogger("graphscope")


class CSVOptions(object):
    """Reader settings used when loading CSV-like files.

    Available options are:
        - the column delimiter
        - a subset of columns to include
        - the type of each included column
        - whether the file contains a header row
    """

    def __init__(self) -> None:
        # Character that separates fields within a row.
        self.delimiter = ","
        # True: column names are taken from the first row of the file.
        # False: columns are named "f0", "f1", ...
        self.header_row = True
        # Format tag of the file, emitted under the "filetype" key.
        self.filetype = "CSV"

        # When non-empty, the names of the columns that should actually be
        # read and converted (in this list's order); columns that are not
        # listed are ignored.
        self.include_columns = []
        # Optional per-column types (disables type inference on those columns).
        self.column_types = []
        # include_columns always holds the id column for vertices and the
        # src/dst id columns for edges. When it holds nothing but those id
        # columns, the user is assumed to want every other property as well
        # (otherwise they would have named at least one property).
        self.force_include_all = False

    def to_dict(self) -> Dict:
        """Return the options as a dict; unset optional entries are omitted.

        Keys are inserted in a fixed order, which `__str__` relies on.
        """
        result = {"delimiter": self.delimiter, "header_row": self.header_row}
        if self.include_columns:
            result["schema"] = ",".join(self.include_columns)
        if self.column_types:
            result["column_types"] = ",".join(
                utils.data_type_to_cpp(dtype) for dtype in self.column_types
            )
        if self.force_include_all:
            result["include_all_columns"] = self.force_include_all
        result["filetype"] = self.filetype
        return result

    def __str__(self) -> str:
        """Render the options as `key=value` pairs joined by `&`."""
        return "&".join("{}={}".format(*pair) for pair in self.to_dict().items())

    def __repr__(self) -> str:
        return str(self)


class Loader(object):
    """Generic data source wrapper.

    Loader can take various data sources, and assemble necessary information into a AttrValue.
    """

    def __init__(
        self, source, delimiter=",", sep=",", header_row=True, filetype="CSV", **kwargs
    ):
        """Initialize a loader with configurable options.

        Note: Loader cannot be reused since it may change inner state when constructing
        information for loading a graph.

        Args:
            source (str or value):
                The data source to be loaded, which could be one of the following:

                    * local file: specified by URL :code:`file://...`, a plain path
                      string or a :code:`pathlib.Path`
                    * oss file: specified by URL :code:`oss://...`
                    * hdfs file: specified by URL :code:`hdfs://...`
                    * s3 file: specified by URL :code:`s3://...`
                    * a sequence of numpy ndarrays in COO format, i.e.
                      :code:`[src_id, dst_id, prop_1, ..., prop_n]`
                    * pandas dataframe
                    * vineyard :code:`Object`, :code:`ObjectID` or :code:`ObjectName`
                      (only when vineyard is importable)

                Ordinary data sources can be loaded using vineyard stream as well, a :code:`vineyard://`
                prefix can be used in the URL then the local file, oss object or HDFS file will be loaded
                into a vineyard stream first, then GraphScope's fragment will be built upon those streams
                in vineyard.

                Once the stream IO in vineyard reaches a stable state, it will be the default mode to
                load data sources and construct fragments in GraphScope.

            delimiter (char, optional): Column delimiter. Defaults to ','

            sep (char, optional): Alias of `delimiter`. It is only consulted when
                `delimiter` is left at its default value. Defaults to ','

            header_row (bool, optional): Whether source have a header. If true, column names
                will be read from the first row of source, else they are named by 'f0', 'f1', ....
                Defaults to True.

            filetype (str, optional): Specify the type of files to load, can be "CSV", "ORC", and
                "PARQUET". Default is "CSV".

            **kwargs: Extra arguments stored as storage options and forwarded as JSON.

        Notes:
            Data is resolved by drivers in `vineyard <https://github.com/v6d-io/v6d>`_ .
            See more additional info in `Loading Graph` section of Docs, and implementations in `vineyard`.
        """
        self.protocol = ""
        # For numpy or pandas, source is the serialized raw bytes
        # For files, it's the location
        # For vineyard, it's the ID or name
        self.source = ""
        # `sep` used to be accepted but silently dropped; treat it as an alias so
        # that `Loader(path, sep="\t")` behaves the way the caller expects.
        # An explicitly given `delimiter` always wins.
        if sep is not None and sep != "," and delimiter == ",":
            delimiter = sep
        # options for data source is csv
        self.options = CSVOptions()
        check_argument(
            isinstance(delimiter, str) and len(delimiter) == 1,
            "The delimiter must be a single character, cannot be '%s'" % delimiter,
        )
        self.options.delimiter = delimiter
        self.options.header_row = header_row
        self.options.filetype = filetype
        # meta for data source is numpy or dataframe
        self.deduced_properties = None
        # extra args directly passed to storage system
        # find more details in fsspec
        #   https://filesystem-spec.readthedocs.io/en/latest/
        self.storage_options = kwargs
        # also parse protocol and source in `resolve` method
        self.resolve(source)

    def __str__(self) -> str:
        return "{}: {}".format(self.protocol, self.source)

    def __repr__(self) -> str:
        return self.__str__()

    def resolve(self, source):
        """Dispatch to the matching `process_*` method based on the type of source.

        Args:
            source: Different data sources

        Raises:
            RuntimeError: If the source is a not supported type.
        """
        if isinstance(source, str):
            self.process_location(source)
        elif isinstance(source, pathlib.Path):
            self.process_location(str(source))
        elif isinstance(source, pd.DataFrame):
            self.process_pandas(source)
        elif vineyard is not None and isinstance(
            source, (vineyard.Object, vineyard.ObjectID, vineyard.ObjectName)
        ):
            self.process_vy_object(source)
        elif isinstance(source, Sequence):
            # Assume a list of numpy array are passed as COO matrix, with length >= 2.
            # Formats: [src_id, dst_id, prop_1, ..., prop_n]
            check_argument(
                all(isinstance(item, np.ndarray) for item in source),
                "Every element of a sequence source must be a numpy.ndarray",
            )
            self.process_numpy(source)
        else:
            raise RuntimeError("Not support source", source)

    def process_location(self, source):
        """Record a location string and derive the protocol from its URL scheme."""
        # If protocol is not set, use 'file' as default
        self.protocol = urlparse(source).scheme or "file"
        self.source = source

    def process_numpy(self, source: Sequence[np.ndarray]):
        """Turn a sequence of arrays into a DataFrame and hand it to `process_pandas`.

        Array `i` becomes column `f<i>`. The frame is built column-wise so that
        every column keeps the exact dtype and values of its array. Building it
        row-wise and transposing would first coerce mixed int/float arrays to a
        common dtype, corrupting 64-bit integers that float64 cannot represent.
        """
        columns = {"f%s" % i: array for i, array in enumerate(source)}
        return self.process_pandas(pd.DataFrame(columns))

    def process_pandas(self, source: pd.DataFrame):
        """Serialize a DataFrame into Arrow IPC stream bytes and record its schema.

        Sets `protocol` to "pandas", `source` to the serialized bytes and
        `deduced_properties` to a list of `(column name, type)` pairs.
        """
        self.protocol = "pandas"
        col_names = list(source.columns.values)
        col_types = [utils._from_numpy_dtype(dtype) for dtype in source.dtypes.values]

        table = pa.Table.from_pandas(source, preserve_index=False)
        sink = pa.BufferOutputStream()
        with pa.ipc.new_stream(sink, table.schema) as writer:
            writer.write_table(table)
        buf = sink.getvalue()

        self.deduced_properties = list(zip(col_names, col_types))
        self.source = bytes(memoryview(buf))

    def process_vy_object(self, source):
        """Record a vineyard object reference as the source.

        Raises:
            ValueError: If source is not a vineyard Object, ObjectID or ObjectName.
        """
        self.protocol = "vineyard"
        # encoding: add a `o` prefix to object id, and a `s` prefix to object name.
        if isinstance(source, vineyard.Object):
            self.source = "o%s" % repr(source.id)
        elif isinstance(source, vineyard.ObjectID):
            self.source = "o%s" % repr(source)
        elif isinstance(source, vineyard.ObjectName):
            self.source = "s%s" % str(source)
        else:
            raise ValueError(
                "Invalid input source: not a vineyard's Object, ObjectID or ObjectName"
            )

    def select_columns(self, columns: Sequence[Tuple[str, int]], include_all=False):
        """Replace the column selection stored in the read options.

        Args:
            columns: `(name, data type)` pairs, in the order they should be read.
            include_all: Stored as `force_include_all` on the options.
        """
        names = []
        data_types = []
        # Single pass, so that one-shot iterables are supported as well.
        for name, data_type in columns:
            names.append(name)
            data_types.append(data_type)
        self.options.include_columns = names
        self.options.column_types = data_types
        self.options.force_include_all = include_all

    def get_attr(self):
        """Assemble the loader state into a dict keyed by `types_pb2` constants.

        Returns:
            dict: Always contains PROTOCOL. Plain local files only add SOURCE,
            with the read options appended after a `#`. Every other source adds
            SOURCE (VALUES for pandas), STORAGE_OPTIONS and READ_OPTIONS.
        """
        config = {types_pb2.PROTOCOL: utils.s_to_attr(self.protocol)}

        if self.protocol == "file":
            # orc and parquet: handled by vineyard
            handled_by_vineyard = self.source.endswith(
                (".orc", ".parquet", ".pq")
            ) or str(self.options.filetype).upper() in ("ORC", "PARQUET")
            if not handled_by_vineyard:
                # Let graphscope handle local files cause it's implemented in c++ and
                # doesn't add an additional stream layer.
                # Maybe handled by vineyard in the near future
                location = "{}#{}".format(self.source, self.options)
                config[types_pb2.SOURCE] = utils.s_to_attr(location)
                return config

        is_pandas = self.protocol == "pandas"
        if is_pandas:
            # The serialized bytes are passed as-is, not wrapped by `s_to_attr`.
            config[types_pb2.VALUES] = self.source
        else:
            # Let vineyard handle other data source.
            config[types_pb2.SOURCE] = utils.s_to_attr(self.source)
        config[types_pb2.STORAGE_OPTIONS] = utils.s_to_attr(
            json.dumps(self.storage_options)
        )
        # In-memory frames carry no CSV read options.
        read_options = {} if is_pandas else self.options.to_dict()
        config[types_pb2.READ_OPTIONS] = utils.s_to_attr(json.dumps(read_options))
        return config