Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ Note that you need to uninstall with the same flags:
`nbstripout` can be used to rewrite an existing Git repository using
[`git filter-repo`](https://github.com/newren/git-filter-repo) to strip output
from existing notebooks. This invocation operates on all `.ipynb` files in the repo:

```bash
#!/bin/bash
git-filter-repo \
Expand Down Expand Up @@ -324,10 +325,28 @@ Do not strip the output, only metadata:

nbstripout --keep-output

##### Output Types

When keeping the output, drop outputs of a specific [`output_type`](https://ipython.readthedocs.io/en/3.x/notebook/nbformat.html#code-cell-outputs), such as `error` or `stream`:

nbstripout --drop-output-type error stream

Drop all output except specific output types (this takes effect even without `--keep-output`):

nbstripout --keep-output-type execute_result

To target only outputs with a given name (for example, a `stream` output is named either `stderr` or `stdout`), append the name after a colon. The following strips all `stderr` output:

nbstripout --drop-output-type stream:stderr

##### Cell IDs

Do not reassign the cell ids to be sequential (which is the default behavior):

nbstripout --keep-id

##### Keeping Output on Specific Cells

To mark special cells so that the output is not stripped, you can either:

1. Set the `keep_output` tag on the cell. To do this, enable the tags toolbar
Expand Down
4 changes: 4 additions & 0 deletions nbstripout/_nbstripout.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,8 @@ def process_jupyter_notebook(
drop_empty_cells=args.drop_empty_cells,
drop_tagged_cells=args.drop_tagged_cells.split(),
strip_init_cells=args.strip_init_cells,
drop_output_types=set(args.drop_output_type),
keep_output_types = set(args.keep_output_type),
max_size=_parse_size(args.max_size),
)

Expand Down Expand Up @@ -451,6 +453,8 @@ def main():
)
parser.add_argument('--keep-count', action='store_true', help='Do not strip the execution count/prompt number')
parser.add_argument('--keep-output', action='store_true', help='Do not strip output', default=None)
parser.add_argument('--drop-output-type', help='Types of output cells to drop, e.g. "error" or "stream". Only has effect with --keep-output', nargs='+', default=[])
parser.add_argument('--keep-output-type', help='Types of output cells to keep, e.g. "error" or "stream". Will take effect without --keep-output', nargs='+', default=[])
parser.add_argument(
'--keep-id',
action='store_true',
Expand Down
54 changes: 51 additions & 3 deletions nbstripout/_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from collections import defaultdict
import sys
from typing import Any, Callable, Iterator, List, Optional
from typing import Any, Callable, Iterator, List, Optional, Set, Dict
import logging
logger = logging.getLogger(__name__)

from nbformat import NotebookNode

Expand Down Expand Up @@ -94,6 +96,34 @@ def strip_zeppelin_output(nb: dict) -> dict:
cell['results'] = {}
return nb

class OutputType:
    """Selector for notebook cell outputs, built from a command-line style spec.

    A selector always carries an ``output_type`` (e.g. ``error`` or ``stream``)
    and optionally a ``name`` that narrows the match further (e.g. ``stderr``
    for a ``stream`` output). Instances compare equal and hash by the
    ``(output_type, name)`` pair so they deduplicate correctly inside sets.
    """

    # Value compared against the output's 'output_type' key.
    output_type: str
    # Value compared against the output's 'name' key; None matches any name.
    name: Optional[str]

    def __init__(self, output_type: str,
                 name: Optional[str] = None):
        self.output_type = output_type
        self.name = name

    @classmethod
    def from_string(cls, s: str) -> 'OutputType':
        """Parse ``'<output_type>'`` or ``'<output_type>:<name>'``.

        Only the first colon separates the two parts; any further colons are
        kept as part of the name instead of raising on too many fields.
        """
        output_type, sep, name = s.partition(':')
        if sep:
            return cls(output_type, name)
        return cls(output_type)

    def matches_output(self, output: Dict) -> bool:
        """Return True if ``output`` has this output type and, if set, this name."""
        return (output.get('output_type') == self.output_type and
                (self.name is None or output.get('name') == self.name))

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, OutputType):
            return NotImplemented
        return (self.output_type, self.name) == (other.output_type, other.name)

    def __hash__(self) -> int:
        # Hash the tuple rather than a concatenated string so that different
        # (output_type, name) splits cannot collapse onto the same key.
        return hash((self.output_type, self.name))

    def __repr__(self) -> str:
        return f'{self.output_type}:{self.name}'




def strip_output(
nb: NotebookNode,
Expand All @@ -104,6 +134,8 @@ def strip_output(
drop_empty_cells: bool = False,
drop_tagged_cells: List[str] = [],
strip_init_cells: bool = False,
drop_output_types: Set[str] = None,
keep_output_types: Set[str] = None,
max_size: int = 0,
) -> NotebookNode:
"""
Expand All @@ -113,6 +145,11 @@ def strip_output(

`extra_keys` could be 'metadata.foo cell.metadata.bar metadata.baz'
"""

# Replace mutable defaults
drop_output_types = {OutputType.from_string(s) for s in drop_output_types} if drop_output_types else set()
keep_output_types = {OutputType.from_string(s) for s in keep_output_types} if keep_output_types else set()

if keep_output is None and 'keep_output' in nb.metadata:
keep_output = bool(nb.metadata['keep_output'])

Expand Down Expand Up @@ -140,15 +177,26 @@ def strip_output(
# Remove the outputs, unless directed otherwise
if 'outputs' in cell:
# Default behavior (max_size == 0) strips all outputs.
if not keep_output_this_cell:
cell['outputs'] = [output for output in cell['outputs'] if get_size(output) <= max_size]
if not keep_output_this_cell or keep_output_types:
cell['outputs'] = [output for output in cell['outputs']
if get_size(output) <= max_size
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this actually take precedence? I don't mind either way but suggest documenting in the README and/or docstring.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, not sure; my instinct is the main use case for keeping an output type would be to preserve graphs/images/whatever in all cases, but that's just a guess based on a teammate's use case. Think I added a line to the Readme, but will confirm.

or any([ot.matches_output(output) for ot in keep_output_types])]

# Strip the counts from the outputs that were kept if not keep_count.
if not keep_count:
for output in cell['outputs']:
if 'execution_count' in output:
output['execution_count'] = None



# Remove specific output types
if drop_output_types:
cell['outputs'] = [
output for output in cell['outputs']
if not any([ot.matches_output(output) for ot in drop_output_types])
]

# If keep_output_this_cell and keep_count, do nothing.

# Remove the prompt_number/execution_count, unless directed otherwise
Expand Down
119 changes: 119 additions & 0 deletions tests/test_output_types.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1c2d582b-7d99-413d-9263-e1b4bdce4066",
"metadata": {},
"source": [
"# Test Notebook for nbstripout error types"
]
},
{
"cell_type": "code",
"id": "0fec6660-9485-42fd-90d6-5c5046096759",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-19T17:19:29.492338Z",
"start_time": "2025-11-19T17:19:29.478763Z"
}
},
"source": [
"import sys\n",
"print('This is not an error')\n",
"sys.stderr.write('This is stderr\\n')\n",
"import notalibrary"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"This is not an error\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"This is stderr\n"
]
},
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'notalibrary'",
"output_type": "error",
"traceback": [
"\u001B[31m---------------------------------------------------------------------------\u001B[39m",
"\u001B[31mModuleNotFoundError\u001B[39m Traceback (most recent call last)",
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[3]\u001B[39m\u001B[32m, line 4\u001B[39m\n\u001B[32m 2\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m'\u001B[39m\u001B[33mThis is not an error\u001B[39m\u001B[33m'\u001B[39m)\n\u001B[32m 3\u001B[39m sys.stderr.write(\u001B[33m'\u001B[39m\u001B[33mThis is stderr\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m'\u001B[39m)\n\u001B[32m----> \u001B[39m\u001B[32m4\u001B[39m \u001B[38;5;28;01mimport\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mnotalibrary\u001B[39;00m\n",
"\u001B[31mModuleNotFoundError\u001B[39m: No module named 'notalibrary'"
]
}
],
"execution_count": 3
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-19T17:15:32.884433Z",
"start_time": "2025-11-19T17:15:32.881115Z"
}
},
"cell_type": "code",
"source": "'a'",
"id": "abd5c048ed6667b7",
"outputs": [
{
"data": {
"text/plain": [
"'a'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 2
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a060078e-cb50-4ed8-8dff-0a7938b8a897",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test\n"
]
}
],
"source": [
"print('test')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
90 changes: 90 additions & 0 deletions tests/test_output_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import os
from copy import deepcopy

import nbformat
import pytest

from nbstripout import strip_output
directory = os.path.dirname(__file__)

@pytest.fixture
def orig_nb():
    """Read the sample notebook next to this test file, passing ``nbformat.NO_CONVERT``."""
    notebook_path = os.path.join(directory, 'test_output_types.ipynb')
    return nbformat.read(notebook_path, nbformat.NO_CONVERT)

def test_drop_errors(orig_nb):
    """Dropping the ``error`` output type removes only error outputs when output is kept."""
    nb_stripped = strip_output(deepcopy(orig_nb),
                               keep_output=True,
                               keep_count=False,
                               keep_id=False,
                               drop_output_types={'error'})

    # No outputs in the markdown
    assert not hasattr(nb_stripped.cells[0], 'outputs')

    # Original cell should have 3 outputs, with the last being error
    assert len(orig_nb.cells[1].outputs) == 3
    assert orig_nb.cells[1].outputs[2]['output_type'] == 'error'

    # After stripping, only the stdout and stderr streams remain, in order.
    # Check the length first so a regression fails on the assertion rather
    # than with an IndexError.
    stripped_outputs = nb_stripped.cells[1].outputs
    assert len(stripped_outputs) == 2
    stripped_output_1, stripped_output_2 = stripped_outputs
    assert stripped_output_1['output_type'] == 'stream'
    assert stripped_output_2['output_type'] == 'stream'
    assert stripped_output_1['name'] == 'stdout'
    assert stripped_output_2['name'] == 'stderr'

    # Third cell should have an execution output
    assert len(nb_stripped.cells[2].outputs) == 1
    assert nb_stripped.cells[2].outputs[0]['output_type'] == 'execute_result'


def test_keep_output(orig_nb):
    """
    Test keep output types: only the listed output types survive stripping.
    """
    nb_stripped = strip_output(deepcopy(orig_nb),
                               keep_output=True,
                               keep_count=False,
                               keep_id=False,
                               keep_output_types={'execute_result'})

    # No outputs in the markdown
    assert not hasattr(nb_stripped.cells[0], 'outputs')

    # Original cell should have 3 outputs, with the last being error
    assert len(orig_nb.cells[1].outputs) == 3
    assert orig_nb.cells[1].outputs[2]['output_type'] == 'error'

    # The second cell only has stream and error outputs, so none are kept
    assert len(nb_stripped.cells[1].outputs) == 0

    # Third cell should keep its single execute_result output
    assert len(nb_stripped.cells[2].outputs) == 1
    assert nb_stripped.cells[2].outputs[0]['output_type'] == 'execute_result'

def test_output_format_tags(orig_nb):
    """
    Test name-qualified output types (``<output_type>:<name>``) in keep_output_types.
    """
    nb_stripped = strip_output(deepcopy(orig_nb),
                               keep_output=False,
                               keep_count=False,
                               keep_id=False,
                               keep_output_types={'stream:stdout', 'execute_result'})

    # No outputs in the markdown
    assert not hasattr(nb_stripped.cells[0], 'outputs')

    # Keeping only stream:stdout should leave just the print output;
    # the stderr stream and the error are stripped
    assert len(orig_nb.cells[1].outputs) == 3
    assert len(nb_stripped.cells[1].outputs) == 1
    assert nb_stripped.cells[1].outputs[0]['output_type'] == 'stream'
    assert nb_stripped.cells[1].outputs[0]['name'] == 'stdout'

    # Third cell should have only the execute_result
    assert len(nb_stripped.cells[2].outputs) == 1