diff --git a/README.md b/README.md index c6b882c..6e37440 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,7 @@ Note that you need to uninstall with the same flags: `nbstripout` can be used to rewrite an existing Git repository using [`git filter-repo`](https://github.com/newren/git-filter-repo) to strip output from existing notebooks. This invocation operates on all `.ipynb` files in the repo: + ```bash #!/bin/bash git-filter-repo \ @@ -324,10 +325,36 @@ Do not strip the output, only metadata: nbstripout --keep-output +#### Output Types + +When keeping the output, drop a specific +[`output_type`](https://ipython.readthedocs.io/en/3.x/notebook/nbformat.html#code-cell-outputs), +like `error` or `stream`: + + nbstripout --drop-output-type error stream + +Drop all output except specific output types: + + nbstripout --keep-output-type execute_result + +_**Note: `--keep-output-type` will override `--max-size` for outputs that match.**_ + +For stripping certain outputs that have names (like `stream` which can be +`stderr` or `stdout`) you can use a colon to specify the name. The following +would strip all `stderr` output: + + nbstripout --drop-output-type stream:stderr + +Likewise, `--drop-output-type stream:stdout` would strip all `stdout` output, including the output of `print` statements. + +#### Cell IDs + Do not reassign the cell ids to be sequential (which is the default behavior): nbstripout --keep-id +#### Keeping Output on Specific Cells + To mark special cells so that the output is not stripped, you can either: 1. Set the `keep_output` tag on the cell. 
def match_output_type(output: Dict, output_type: str) -> bool:
    """
    Return whether a notebook cell output matches a user-provided selector.

    Supported selector formats are `output_type` (e.g. `error`) and
    `output_type:name` (e.g. `stream:stderr`).

    :param output: The output dictionary from a notebook cell.
    :param output_type: User-provided string to match against.
    :return: True if the output's `output_type` equals the selector's type
        and, when the selector carries a name, the output's `name` equals
        that name as well; False otherwise.
    """
    # Split on the FIRST ':' only. Unpacking the result of a plain
    # `split(':')` raises ValueError as soon as the selector contains more
    # than one colon; with `partition` everything after the first colon is
    # treated as the name, so such a selector simply fails to match.
    wanted_type, separator, wanted_name = output_type.partition(':')

    if output.get('output_type') != wanted_type:
        return False

    # No ':' in the selector means there is no constraint on the name.
    return not separator or output.get('name') == wanted_name
# Remove the prompt_number/execution_count, unless directed otherwise diff --git a/tests/test_output_types.ipynb b/tests/test_output_types.ipynb new file mode 100644 index 0000000..dd133e9 --- /dev/null +++ b/tests/test_output_types.ipynb @@ -0,0 +1,119 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1c2d582b-7d99-413d-9263-e1b4bdce4066", + "metadata": {}, + "source": [ + "# Test Notebook for nbstripout error types" + ] + }, + { + "cell_type": "code", + "id": "0fec6660-9485-42fd-90d6-5c5046096759", + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-19T17:19:29.492338Z", + "start_time": "2025-11-19T17:19:29.478763Z" + } + }, + "source": [ + "import sys\n", + "print('This is not an error')\n", + "sys.stderr.write('This is stderr\\n')\n", + "import notalibrary" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is not an error\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "This is stderr\n" + ] + }, + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'notalibrary'", + "output_type": "error", + "traceback": [ + "\u001B[31m---------------------------------------------------------------------------\u001B[39m", + "\u001B[31mModuleNotFoundError\u001B[39m Traceback (most recent call last)", + "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[3]\u001B[39m\u001B[32m, line 4\u001B[39m\n\u001B[32m 2\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m'\u001B[39m\u001B[33mThis is not an error\u001B[39m\u001B[33m'\u001B[39m)\n\u001B[32m 3\u001B[39m sys.stderr.write(\u001B[33m'\u001B[39m\u001B[33mThis is stderr\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m'\u001B[39m)\n\u001B[32m----> \u001B[39m\u001B[32m4\u001B[39m \u001B[38;5;28;01mimport\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mnotalibrary\u001B[39;00m\n", + "\u001B[31mModuleNotFoundError\u001B[39m: No module named 'notalibrary'" + ] + } + ], + "execution_count": 3 + }, + { + "metadata": { + 
import os
from copy import deepcopy

import nbformat
import pytest

from nbstripout import strip_output

directory = os.path.dirname(__file__)


@pytest.fixture
def orig_nb():
    """Load the unstripped fixture notebook used by every test in this module."""
    fname = 'test_output_types.ipynb'
    return nbformat.read(os.path.join(directory, fname), nbformat.NO_CONVERT)


def test_drop_errors(orig_nb):
    """
    Confirm that --drop-output-type works as expected,
    when asking just to drop `error` outputs.
    """
    nb_stripped = strip_output(
        deepcopy(orig_nb),
        keep_output=True,
        keep_count=False,
        keep_id=False,
        drop_output_types={'error'},
    )

    # No outputs in the markdown
    assert not hasattr(nb_stripped.cells[0], 'outputs')

    # Original cell should have 3 outputs, with the last being error
    assert len(orig_nb.cells[1].outputs) == 3
    assert orig_nb.cells[1].outputs[2]['output_type'] == 'error'

    # After stripping, the second cell keeps its stdout and stderr streams
    # (in their original order) and loses the error.
    remaining = nb_stripped.cells[1].outputs
    assert len(remaining) == 2
    assert remaining[0]['output_type'] == 'stream'
    assert remaining[1]['output_type'] == 'stream'
    assert remaining[0]['name'] == 'stdout'
    assert remaining[1]['name'] == 'stderr'

    # Third cell should still have its execute_result
    assert len(nb_stripped.cells[2].outputs) == 1
    assert nb_stripped.cells[2].outputs[0]['output_type'] == 'execute_result'

    # Fourth cell has a single stdout stream, which is not an error
    assert len(nb_stripped.cells[3].outputs) == 1
    assert nb_stripped.cells[3].outputs[0]['output_type'] == 'stream'


def test_keep_output(orig_nb):
    """
    Confirm that --keep-output-type works as expected,
    dropping all but `execute_result` outputs from the notebook.
    """
    nb_stripped = strip_output(
        deepcopy(orig_nb),
        keep_output=True,
        keep_count=False,
        keep_id=False,
        keep_output_types={'execute_result'},
    )

    # No outputs in the markdown
    assert not hasattr(nb_stripped.cells[0], 'outputs')

    # Original cell should have 3 outputs, with the last being error
    assert len(orig_nb.cells[1].outputs) == 3
    assert orig_nb.cells[1].outputs[2]['output_type'] == 'error'

    # All outputs should be stripped in the second cell
    # (two streams and an error, none of which is an execute_result)
    assert len(nb_stripped.cells[1].outputs) == 0

    # Third cell should keep its execute_result
    assert len(nb_stripped.cells[2].outputs) == 1
    assert nb_stripped.cells[2].outputs[0]['output_type'] == 'execute_result'

    # Fourth cell only has a stdout stream, which should be stripped
    assert len(nb_stripped.cells[3].outputs) == 0


def test_output_format_tags(orig_nb):
    """
    Confirm that both the `output_type:name` and the plain `output_type`
    selector formats work.
    """
    nb_stripped = strip_output(
        deepcopy(orig_nb),
        keep_output=False,
        keep_count=False,
        keep_id=False,
        keep_output_types={'stream:stdout', 'execute_result'},
    )

    # No outputs in the markdown
    assert not hasattr(nb_stripped.cells[0], 'outputs')

    # Stripping all but stdout should leave only the print statement
    assert len(orig_nb.cells[1].outputs) == 3
    remaining = nb_stripped.cells[1].outputs
    assert len(remaining) == 1
    assert remaining[0]['output_type'] == 'stream'
    assert remaining[0]['name'] == 'stdout'

    # Third cell should have only the execute_result
    assert len(nb_stripped.cells[2].outputs) == 1
    assert nb_stripped.cells[2].outputs[0]['output_type'] == 'execute_result'

    # Fourth cell's stdout stream matches `stream:stdout` and is kept
    assert len(nb_stripped.cells[3].outputs) == 1
    assert nb_stripped.cells[3].outputs[0]['name'] == 'stdout'