
Commit c87b7ea

Author: Zohar Mizrahi

[RAPTOR-11836] Remove support for Apache Arrow (#1215)

1 parent 7a58779 commit c87b7ea

File tree

31 files changed: +33 -302 lines changed

custom_model_runner/CHANGELOG.md

Lines changed: 4 additions & 0 deletions

@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+#### [1.16.1] - In Progress
+##### Changed
+- Remove support for Apache Arrow.
+
 #### [1.16.0] - 2025-01-08
 ##### Changed
 - Remove 'mlpiper' dependency and replace its functionality with comparable built-in implementations.

custom_model_runner/README.md

Lines changed: 6 additions & 10 deletions

@@ -289,28 +289,24 @@ Example: POST http://localhost:6789/predict/; POST http://localhost:6789/predict
 For these routes data can be posted in two ways:
 * as form data parameter with a <key:value> pair, where:
   key = X
-  value = filename of the `csv/arrow/mtx` format, that contains the inference data.
-* as binary data; in case of `arrow` or `mtx` formats, mimetype `application/x-apache-arrow-stream` or `text/mtx` must be set.
+  value = filename of the `csv/mtx` format, that contains the inference data.
+* as binary data; in case of `mtx` format, mimetype `text/mtx` must be set.
 
 * Structured transform route (for Python predictor only):
   A POST **URL_PREFIX/transform/** route, which returns transformed data.
   Example: POST http://localhost:6789/transform/;
   For this route data can be posted in two ways:
   * as form data parameter with a <key:value> pair, where:
     key = `X`.
-    value = filename of the `csv/arrow/mtx` format, that contains the inference data.
+    value = filename of the `csv/mtx` format, that contains the inference data.
 
   optionally a second key, `y`, can be passed with value = a second filename containing target data.
 
   if `y` is passed, the route will return both `X.transformed` and `y.transformed` keys, along with `out.format`
-  indicating the format of the transformed X output. This will take a value of `csv`,
-  `sparse` or `arrow`. `y.transformed` is never sparse.
-
-  an `arrow_version` key may also be passed if you desire to use `arrow` format for `X.transformed` or `y.transformed`.
-  this is used to ensure that the endpoint returns data that can be opened by the caller's version of arrow. without this
-  key, all dense data returned will default to csv format.
+  indicating the format of the transformed X output. This will take a value of `csv` or `sparse`.
+  `y.transformed` is never sparse.
 
-* as binary data; in case of `arrow` or `mtx` formats, mimetype `application/x-apache-arrow-stream` or `text/mtx` must be set.
+* as binary data; in case of `mtx` format, mimetype `text/mtx` must be set.
 
 
 * Unstructured predictions routes:
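The two posting conventions above can be sketched without a running server by building (not sending) the requests with the standard library. This is an illustrative sketch only: the URL, file name, and both helper names are ours, not part of DRUM.

```python
import urllib.request


def form_data_request(url, csv_bytes, filename="data.csv"):
    """Build a multipart/form-data POST whose single field is keyed `X`."""
    boundary = "drum-example-boundary"
    head = (
        f"--{boundary}\r\n"
        f'Content-Disposition: form-data; name="X"; filename="{filename}"\r\n'
        f"Content-Type: text/csv\r\n\r\n"
    ).encode("utf-8")
    tail = f"\r\n--{boundary}--\r\n".encode("utf-8")
    return urllib.request.Request(
        url,
        data=head + csv_bytes + tail,
        headers={"Content-Type": f"multipart/form-data; boundary={boundary}"},
        method="POST",
    )


def binary_request(url, payload, mimetype="text/csv"):
    """Raw-binary POST; per the README, only `text/mtx` must be set explicitly."""
    return urllib.request.Request(
        url, data=payload, headers={"Content-Type": mimetype}, method="POST"
    )


req = form_data_request("http://localhost:6789/predict/", b"a,b\n1,2\n")
```

Passing either request object to `urllib.request.urlopen` would perform the actual POST against a running DRUM server.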

custom_model_runner/datarobot_drum/drum/adapters/model_adapters/python_model_adapter.py

Lines changed: 0 additions & 4 deletions

@@ -28,7 +28,6 @@
 from datarobot_drum.drum.artifact_predictors.onnx_predictor import ONNXPredictor
 
 from datarobot_drum.drum.common import (
-    get_pyarrow_module,
     reroute_stdout_to_stderr,
     SupportedPayloadFormats,
 )

@@ -428,9 +427,6 @@ def supported_payload_formats(self):
        formats = SupportedPayloadFormats()
        formats.add(PayloadFormat.CSV)
        formats.add(PayloadFormat.MTX)
-       pa = get_pyarrow_module()
-       if pa is not None:
-           formats.add(PayloadFormat.ARROW, pa.__version__)
        return formats
 
    def model_info(self):
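With the conditional Arrow entry gone, `supported_payload_formats` always reports the same two entries. A minimal re-sketch of the behavior, assuming a stripped-down stand-in for the real class in `drum/common.py`:

```python
class SupportedPayloadFormats:
    """Stripped-down stand-in for the class in drum/common.py (illustrative)."""

    def __init__(self):
        self._formats = {}

    def add(self, payload_format, format_version=None):
        # Version is optional; after this change no format pins a version.
        self._formats[payload_format] = format_version

    def __iter__(self):
        yield from self._formats.items()


# Mirrors the simplified supported_payload_formats property: csv and mtx only.
formats = SupportedPayloadFormats()
formats.add("csv")
formats.add("mtx")
```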

custom_model_runner/datarobot_drum/drum/common.py

Lines changed: 0 additions & 17 deletions

@@ -68,7 +68,6 @@ def __init__(self):
            PredictionServerMimetypes.TEXT_CSV: PayloadFormat.CSV,
            PredictionServerMimetypes.TEXT_PLAIN: PayloadFormat.CSV,
            PredictionServerMimetypes.TEXT_MTX: PayloadFormat.MTX,
-           PredictionServerMimetypes.APPLICATION_X_APACHE_ARROW_STREAM: PayloadFormat.ARROW,
        }
 
    def add(self, payload_format, format_version=None):

@@ -86,22 +85,6 @@ def __iter__(self):
        yield payload_format, format_version
 
 
-try:
-    import pyarrow
-except ImportError:
-    pyarrow = None
-
-
-def get_pyarrow_module():
-    return pyarrow
-
-
-def verify_pyarrow_module():
-    if pyarrow is None:
-        raise ModuleNotFoundError("Please install pyarrow to support Arrow format")
-    return pyarrow
-
-
 def to_bool(value):
    if value is None:
        return False
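The deleted `get_pyarrow_module`/`verify_pyarrow_module` helpers were an instance of the common optional-dependency pattern. A generic stdlib sketch of that pattern (the `optional_import` name is ours, not DRUM's):

```python
import importlib


def optional_import(name):
    """Return the named module if it is installed, otherwise None."""
    try:
        return importlib.import_module(name)
    except ImportError:
        return None


# Callers branch on availability instead of crashing at import time,
# exactly as the removed pyarrow helpers allowed.
json_mod = optional_import("json")                 # stdlib, always present
missing = optional_import("surely_not_installed")  # hypothetical missing pkg
```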

custom_model_runner/datarobot_drum/drum/enum.py

Lines changed: 0 additions & 4 deletions

@@ -152,15 +152,13 @@ class PredictionServerMimetypes:
    APPLICATION_JSON = "application/json"
    APPLICATION_OCTET_STREAM = "application/octet-stream"
    TEXT_PLAIN = "text/plain"
-   APPLICATION_X_APACHE_ARROW_STREAM = "application/x-apache-arrow-stream"
    TEXT_MTX = "text/mtx"
    TEXT_CSV = "text/csv"
    EMPTY = ""
 
 
 class InputFormatExtension:
    MTX = ".mtx"
-   ARROW = ".arrow"
    CSV = ".csv"
 
 

@@ -181,7 +179,6 @@ class ModelInfoKeys:
 
 InputFormatToMimetype = {
    InputFormatExtension.MTX: PredictionServerMimetypes.TEXT_MTX,
-   InputFormatExtension.ARROW: PredictionServerMimetypes.APPLICATION_X_APACHE_ARROW_STREAM,
 }
 
 

@@ -413,7 +410,6 @@ class EnvVarNames:
 
 class PayloadFormat:
    CSV = "csv"
-   ARROW = "arrow"
    MTX = "mtx"
 
 
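After the enum cleanup, `InputFormatToMimetype` carries a single entry. A hedged sketch of how such a mapping might be consulted; the `mimetype_for` helper and its csv fallback are illustrative, not DRUM's actual lookup code:

```python
from pathlib import Path

TEXT_MTX = "text/mtx"
TEXT_CSV = "text/csv"

# Mirrors enum.py after the change: only the .mtx extension maps to a mimetype.
InputFormatToMimetype = {".mtx": TEXT_MTX}


def mimetype_for(filename):
    """Illustrative helper: look up by extension, fall back to text/csv."""
    return InputFormatToMimetype.get(Path(filename).suffix, TEXT_CSV)
```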

custom_model_runner/datarobot_drum/drum/root_predictors/predict_mixin.py

Lines changed: 6 additions & 38 deletions

@@ -30,7 +30,6 @@
 )
 from datarobot_drum.drum.root_predictors.transform_helpers import (
    is_sparse,
-   make_arrow_payload,
    make_csv_payload,
    make_mtx_payload,
 )

@@ -74,7 +73,7 @@ def _fetch_data_from_request(file_key, logger=None):
        else:
            wrong_key_error_message = (
                "Samples should be provided as: "
-               " - a csv, mtx, or arrow file under `{}` form-data param key."
+               " - a csv or mtx under `{}` form-data param key."
                " - binary data".format(file_key)
            )
            if logger is not None:

@@ -126,8 +125,6 @@ def _check_mimetype_support(self, mimetype):
            )
            + "Make DRUM support the format or implement `read_input_data` hook to read the data. "
        )
-       if mimetype == PredictionServerMimetypes.APPLICATION_X_APACHE_ARROW_STREAM:
-           error_message += "pyarrow package may be missing, try to install."
        return {"message": error_message}, HTTP_422_UNPROCESSABLE_ENTITY
    return None
 

@@ -193,16 +190,6 @@ def _build_drum_response_json_str(predict_response):
    def _transform(self, logger=None):
        response_status = HTTP_200_OK
 
-       arrow_key = "arrow_version"
-       arrow_version = request.files.get(arrow_key)
-       # TODO: check implementation of how arrow_version is passed
-       # Currently it is passed as a file content,
-       # so arrow_version is of type werkzeug.datastructures.FileStorage,
-       # that's why io.BytesIO getvalue is called on it.
-       if arrow_version is not None:
-           arrow_version = arrow_version.getvalue().decode("utf-8")
-       use_arrow = arrow_version is not None
-
        try:
            feature_binary_data, feature_mimetype, feature_charset = self._fetch_data_from_request(
                "X", logger=logger

@@ -256,32 +243,13 @@
 
        # make output
        if is_sparse(out_data):
-           if use_arrow:
-               target_payload = (
-                   make_arrow_payload(out_target, arrow_version)
-                   if out_target is not None
-                   else None
-               )
-               target_out_format = "arrow"
-           else:
-               target_payload = make_csv_payload(out_target) if out_target is not None else None
-               target_out_format = "csv"
+           target_payload = make_csv_payload(out_target) if out_target is not None else None
            feature_payload, colnames = make_mtx_payload(out_data)
            out_format = "sparse"
        else:
-           if use_arrow:
-               feature_payload = make_arrow_payload(out_data, arrow_version)
-               target_payload = (
-                   make_arrow_payload(out_target, arrow_version)
-                   if out_target is not None
-                   else None
-               )
-               out_format = "arrow"
-           else:
-               feature_payload = make_csv_payload(out_data)
-               target_payload = make_csv_payload(out_target) if out_target is not None else None
-               out_format = "csv"
-           target_out_format = out_format
+           feature_payload = make_csv_payload(out_data)
+           target_payload = make_csv_payload(out_target) if out_target is not None else None
+           out_format = "csv"
 
        out_fields = {
            "X.format": out_format,

@@ -306,7 +274,7 @@
        if target_payload is not None:
            out_fields.update(
                {
-                   "y.format": target_out_format,
+                   "y.format": "csv",
                    Y_TRANSFORM_KEY: (
                        Y_TRANSFORM_KEY,
                        target_payload,
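The rewritten branch in `_transform` collapses to a simple rule: sparse features go out as mtx (reported as `sparse`) with a csv target, while dense features and targets are always csv. Distilled into a toy decision helper (ours, for illustration, not DRUM code):

```python
def choose_output_formats(x_is_sparse, has_target):
    """Toy stand-in for the simplified format decision in _transform.

    Sparse X is serialized as mtx and reported as "sparse"; everything
    else, including the optional y payload, is always csv now that the
    arrow branch is gone.
    """
    x_format = "sparse" if x_is_sparse else "csv"
    y_format = "csv" if has_target else None
    return x_format, y_format
```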

custom_model_runner/datarobot_drum/drum/root_predictors/transform_helpers.py

Lines changed: 0 additions & 34 deletions

@@ -16,7 +16,6 @@
 from scipy.sparse import issparse
 from scipy.sparse.csr import csr_matrix
 
-from datarobot_drum.drum.common import verify_pyarrow_module
 from datarobot_drum.drum.enum import X_FORMAT_KEY, X_TRANSFORM_KEY
 
 

@@ -73,30 +72,6 @@ def validate_and_convert_column_names_for_serialization(df):
    return df
 
 
-def make_arrow_payload(df, arrow_version):
-    pa = verify_pyarrow_module()
-    df = validate_and_convert_column_names_for_serialization(df)
-
-    pyarrow_available_version = version.parse(pa.__version__)
-    pyarrow_requested_version = version.parse(arrow_version)
-    pyarrow_0_20_version = version.parse("0.20")
-
-    if (
-        pyarrow_requested_version != pyarrow_available_version
-        and pyarrow_requested_version < pyarrow_0_20_version
-    ):
-        batch = pa.RecordBatch.from_pandas(df, nthreads=None, preserve_index=False)
-        sink = pa.BufferOutputStream()
-        options = pa.ipc.IpcWriteOptions(
-            metadata_version=pa.MetadataVersion.V4, use_legacy_format=True
-        )
-        with pa.RecordBatchStreamWriter(sink, batch.schema, options=options) as writer:
-            writer.write_batch(batch)
-        return sink.getvalue().to_pybytes()
-    else:
-        return pa.ipc.serialize_pandas(df, preserve_index=False).to_pybytes()
-
-
 def make_csv_payload(df):
    df = validate_and_convert_column_names_for_serialization(df)
 

@@ -107,14 +82,6 @@ def make_csv_payload(df):
    return s_buf.getvalue()[:-2].encode("utf-8")
 
 
-def read_arrow_payload(response_dict, transform_key):
-    pa = verify_pyarrow_module()
-
-    bytes = response_dict[transform_key]
-    df = pa.ipc.deserialize_pandas(bytes)
-    return df
-
-
 def read_csv_payload(response_dict, transform_key):
    bytes = response_dict[transform_key]
    return pd.read_csv(BytesIO(bytes))

@@ -159,7 +126,6 @@ def _sparse(data, key):
    return pd.DataFrame.sparse.from_spmatrix(read_mtx_payload(data, key))
 
 reader = {
-   "arrow": read_arrow_payload,
    "sparse": _sparse,
    "csv": read_csv_payload,
 }
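With `make_arrow_payload`/`read_arrow_payload` removed, the csv path is the only dense serialization left in the `reader` table. A stdlib round-trip sketch of that path; DRUM's real helpers use pandas `to_csv`/`read_csv` rather than these illustrative functions:

```python
import csv
import io


def make_csv_payload(rows, header):
    """Serialize tabular data to utf-8 csv bytes (stdlib stand-in)."""
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(header)
    writer.writerows(rows)
    return buf.getvalue().encode("utf-8")


def read_csv_payload(payload):
    """Parse csv bytes back into (header, rows); all cells come back as str."""
    records = list(csv.reader(io.StringIO(payload.decode("utf-8"))))
    return records[0], records[1:]


payload = make_csv_payload([[1, 2]], ["a", "b"])
```

The round trip preserves structure but not dtypes, which is why the deleted code in `structured_input_read_utils.py` had to normalize None/NaN handling between the csv and arrow paths.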

custom_model_runner/datarobot_drum/drum/utils/structured_input_read_utils.py

Lines changed: 0 additions & 17 deletions

@@ -12,7 +12,6 @@
 import pandas as pd
 from scipy.io import mmread
 
-from datarobot_drum.drum.common import get_pyarrow_module
 from datarobot_drum.drum.enum import (
    InputFormatToMimetype,
    PredictionServerMimetypes,

@@ -67,22 +66,6 @@ def read_structured_input_data_as_df(binary_data, mimetype, sparse_colnames=None
        return pd.DataFrame.sparse.from_spmatrix(
            mmread(io.BytesIO(binary_data)), columns=sparse_colnames
        )
-   elif mimetype == PredictionServerMimetypes.APPLICATION_X_APACHE_ARROW_STREAM:
-       df = get_pyarrow_module().ipc.deserialize_pandas(binary_data)
-
-       # After CSV serialization+deserialization,
-       # original dataframe's None and np.nan values
-       # become np.nan values.
-       # After Arrow serialization+deserialization,
-       # original dataframe's None and np.nan values
-       # become np.nan for numeric columns and None for 'object' columns.
-       #
-       # Since we are supporting both CSV and Arrow,
-       # to be consistent with CSV serialization/deserialization,
-       # it is required to replace all None with np.nan for Arrow.
-       df.fillna(value=np.nan, inplace=True)
-
-       return df
    else:  # CSV format
        try:
            df = pd.read_csv(io.BytesIO(binary_data))

custom_model_runner/drum_server_api.yaml

Lines changed: 1 addition & 9 deletions

@@ -218,8 +218,6 @@ paths:
            type: object
            description: If format is supported, property present in the object. Property's value is a package version. If version is not pinned, value is null.
            properties:
-             arrow:
-               type: string
              csv:
                type: string
              mtx:

@@ -232,7 +230,6 @@
            type: boolean
          example:
            supported_payload_formats:
-             arrow: 2.0.0
              csv: null
              mtx: null
  /URL_PREFIX/predict/:

@@ -256,11 +253,6 @@
            description: Scoring data.
            type: string
            format: text
-         application/x-apache-arrow-stream:
-           schema:
-             description: Scoring data.
-             type: string
-             format: binary
          multipart/form-data:
            schema:
              description: Scoring data.

@@ -290,7 +282,7 @@
            type: string
            description: Status message
          example:
-           message: "ERROR: Samples should be provided as: - a csv, mtx, or arrow file under `X` form-data param key. - binary data."
+           message: "ERROR: Samples should be provided as: - a csv or mtx under `X` form-data param key. - binary data."
  /URL_PREFIX/predictions/:
    $ref: "#/paths/~1URL_PREFIX~1predict~1"
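A caller inspecting the capabilities endpoint after this change would see only csv and mtx entries. A minimal sketch of validating a response against the spec's updated example shape (the response literal below is taken from the spec's example, the check itself is ours):

```python
# Example body copied from the spec's updated capabilities example:
# a null version means the format is supported but its version is not pinned.
response = {"supported_payload_formats": {"csv": None, "mtx": None}}

formats = response["supported_payload_formats"]
supported = set(formats)
```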

custom_model_runner/requirements.txt

Lines changed: 0 additions & 2 deletions

@@ -14,8 +14,6 @@ strictyaml==1.4.2
 PyYAML
 texttable
 py4j~=0.10.9.0
-# only constrained by other packages, not DRUM
-pyarrow
 Pillow
 # constrained by Julia env
 julia<=0.5.7
