-
Notifications
You must be signed in to change notification settings - Fork 101
Add --drop-output-type arg #204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
262b4f1
8ae87a0
db6dd39
912839f
407c27b
76ef7dd
b5a7d8a
3071c32
7dd1d4a
7ed6dba
3400342
8e44e3c
0b5c608
f22da39
78f4013
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,8 @@ | ||
| from collections import defaultdict | ||
| import sys | ||
| from typing import Any, Callable, Iterator, List, Optional | ||
| from typing import Any, Callable, Iterator, List, Optional, Set, Dict | ||
| import logging | ||
| logger = logging.getLogger(__name__) | ||
rgeorgi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| from nbformat import NotebookNode | ||
|
|
||
|
|
@@ -94,6 +96,34 @@ def strip_zeppelin_output(nb: dict) -> dict: | |
| cell['results'] = {} | ||
| return nb | ||
|
|
||
class OutputType:
    """
    Specification of a notebook output to match, parsed from a string such as
    'error' or 'stream:stderr'.

    `output_type` is compared against an output's 'output_type' key. `name`,
    when given, is additionally compared against the output's 'name' key;
    when it is None, any name matches.
    """

    output_type: str
    name: Optional[str]

    def __init__(self, output_type: str,
                 name: Optional[str] = None):
        self.output_type = output_type
        self.name = name

    @classmethod
    def from_string(cls, s: str) -> 'OutputType':
        """
        Build an OutputType from 'output_type' or 'output_type:name'.

        Only the first colon separates the two parts, so a spec containing
        further colons no longer raises a TypeError. An empty name
        ('stream:') is treated the same as no name at all.
        """
        output_type, _, name = s.partition(':')
        return cls(output_type, name or None)

    def matches_output(self, output: Dict[str, Any]) -> bool:
        """
        Return True if `output` has this output type and, when a name was
        specified, that name as well.
        """
        if output.get('output_type') != self.output_type:
            return False
        return self.name is None or output.get('name') == self.name

    def _key(self):
        # Single source of truth for equality and hashing.
        return (self.output_type, self.name)

    def __eq__(self, other):
        # Defined together with __hash__ so that equal specifications
        # collapse to one element when collected in a set.
        if not isinstance(other, OutputType):
            return NotImplemented
        return self._key() == other._key()

    def __hash__(self):
        # Hash the tuple rather than a concatenated string so that
        # ('a', None) and ('aNone', None) do not collide by construction.
        return hash(self._key())

    def __repr__(self):
        return f'{self.output_type}:{self.name}'
|
|
||
|
|
||
|
|
||
|
|
||
| def strip_output( | ||
| nb: NotebookNode, | ||
|
|
@@ -104,6 +134,8 @@ def strip_output( | |
| drop_empty_cells: bool = False, | ||
| drop_tagged_cells: List[str] = [], | ||
| strip_init_cells: bool = False, | ||
| drop_output_types: Set[str] = None, | ||
| keep_output_types: Set[str] = None, | ||
| max_size: int = 0, | ||
| ) -> NotebookNode: | ||
| """ | ||
|
|
@@ -113,6 +145,11 @@ def strip_output( | |
|
|
||
| `extra_keys` could be 'metadata.foo cell.metadata.bar metadata.baz' | ||
| """ | ||
|
|
||
| # Replace mutable defaults | ||
| drop_output_types = {OutputType.from_string(s) for s in drop_output_types} if drop_output_types else set() | ||
| keep_output_types = {OutputType.from_string(s) for s in keep_output_types} if keep_output_types else set() | ||
|
|
||
| if keep_output is None and 'keep_output' in nb.metadata: | ||
| keep_output = bool(nb.metadata['keep_output']) | ||
|
|
||
|
|
@@ -140,15 +177,26 @@ def strip_output( | |
| # Remove the outputs, unless directed otherwise | ||
| if 'outputs' in cell: | ||
| # Default behavior (max_size == 0) strips all outputs. | ||
| if not keep_output_this_cell: | ||
| cell['outputs'] = [output for output in cell['outputs'] if get_size(output) <= max_size] | ||
| if not keep_output_this_cell or keep_output_types: | ||
| cell['outputs'] = [output for output in cell['outputs'] | ||
| if get_size(output) <= max_size | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Should this actually take precedence? I don't mind either way, but I suggest documenting it in the README and/or docstring.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yeah, not sure; my instinct is that the main use case for keeping an output type would be to preserve graphs/images/whatever in all cases, but that's just a guess based on a teammate's use case. I think I added a line to the README, but will confirm. |
||
| or any([ot.matches_output(output) for ot in keep_output_types])] | ||
rgeorgi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| # Strip the counts from the outputs that were kept if not keep_count. | ||
| if not keep_count: | ||
| for output in cell['outputs']: | ||
| if 'execution_count' in output: | ||
| output['execution_count'] = None | ||
|
|
||
|
|
||
|
|
||
| # Remove specific output types | ||
rgeorgi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if drop_output_types: | ||
| cell['outputs'] = [ | ||
| output for output in cell['outputs'] | ||
| if not any([ot.matches_output(output) for ot in drop_output_types]) | ||
rgeorgi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ] | ||
|
|
||
| # If keep_output_this_cell and keep_count, do nothing. | ||
|
|
||
| # Remove the prompt_number/execution_count, unless directed otherwise | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,119 @@ | ||
| { | ||
| "cells": [ | ||
| { | ||
| "cell_type": "markdown", | ||
| "id": "1c2d582b-7d99-413d-9263-e1b4bdce4066", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "# Test Notebook for nbstripout error types" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "id": "0fec6660-9485-42fd-90d6-5c5046096759", | ||
| "metadata": { | ||
| "ExecuteTime": { | ||
| "end_time": "2025-11-19T17:19:29.492338Z", | ||
| "start_time": "2025-11-19T17:19:29.478763Z" | ||
| } | ||
| }, | ||
| "source": [ | ||
| "import sys\n", | ||
| "print('This is not an error')\n", | ||
| "sys.stderr.write('This is stderr\\n')\n", | ||
| "import notalibrary" | ||
| ], | ||
| "outputs": [ | ||
| { | ||
| "name": "stdout", | ||
| "output_type": "stream", | ||
| "text": [ | ||
| "This is not an error\n" | ||
| ] | ||
| }, | ||
| { | ||
| "name": "stderr", | ||
| "output_type": "stream", | ||
| "text": [ | ||
| "This is stderr\n" | ||
| ] | ||
| }, | ||
| { | ||
| "ename": "ModuleNotFoundError", | ||
| "evalue": "No module named 'notalibrary'", | ||
| "output_type": "error", | ||
| "traceback": [ | ||
| "\u001B[31m---------------------------------------------------------------------------\u001B[39m", | ||
| "\u001B[31mModuleNotFoundError\u001B[39m Traceback (most recent call last)", | ||
| "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[3]\u001B[39m\u001B[32m, line 4\u001B[39m\n\u001B[32m 2\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m'\u001B[39m\u001B[33mThis is not an error\u001B[39m\u001B[33m'\u001B[39m)\n\u001B[32m 3\u001B[39m sys.stderr.write(\u001B[33m'\u001B[39m\u001B[33mThis is stderr\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m'\u001B[39m)\n\u001B[32m----> \u001B[39m\u001B[32m4\u001B[39m \u001B[38;5;28;01mimport\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mnotalibrary\u001B[39;00m\n", | ||
| "\u001B[31mModuleNotFoundError\u001B[39m: No module named 'notalibrary'" | ||
| ] | ||
| } | ||
| ], | ||
| "execution_count": 3 | ||
| }, | ||
| { | ||
| "metadata": { | ||
| "ExecuteTime": { | ||
| "end_time": "2025-11-19T17:15:32.884433Z", | ||
| "start_time": "2025-11-19T17:15:32.881115Z" | ||
| } | ||
| }, | ||
| "cell_type": "code", | ||
| "source": "'a'", | ||
| "id": "abd5c048ed6667b7", | ||
| "outputs": [ | ||
| { | ||
| "data": { | ||
| "text/plain": [ | ||
| "'a'" | ||
| ] | ||
| }, | ||
| "execution_count": 2, | ||
| "metadata": {}, | ||
| "output_type": "execute_result" | ||
| } | ||
| ], | ||
| "execution_count": 2 | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": 3, | ||
| "id": "a060078e-cb50-4ed8-8dff-0a7938b8a897", | ||
| "metadata": {}, | ||
| "outputs": [ | ||
| { | ||
| "name": "stdout", | ||
| "output_type": "stream", | ||
| "text": [ | ||
| "test\n" | ||
| ] | ||
| } | ||
| ], | ||
| "source": [ | ||
| "print('test')" | ||
| ] | ||
| } | ||
| ], | ||
| "metadata": { | ||
| "kernelspec": { | ||
| "display_name": "Python 3 (ipykernel)", | ||
| "language": "python", | ||
| "name": "python3" | ||
| }, | ||
| "language_info": { | ||
| "codemirror_mode": { | ||
| "name": "ipython", | ||
| "version": 3 | ||
| }, | ||
| "file_extension": ".py", | ||
| "mimetype": "text/x-python", | ||
| "name": "python", | ||
| "nbconvert_exporter": "python", | ||
| "pygments_lexer": "ipython3", | ||
| "version": "3.13.3" | ||
| } | ||
| }, | ||
| "nbformat": 4, | ||
| "nbformat_minor": 5 | ||
| } |
rgeorgi marked this conversation as resolved.
Show resolved
Hide resolved
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
| import os | ||
| from copy import deepcopy | ||
|
|
||
| import nbformat | ||
| import pytest | ||
|
|
||
| from nbstripout import strip_output | ||
| directory = os.path.dirname(__file__) | ||
|
|
||
@pytest.fixture
def orig_nb():
    """
    Read the sample notebook that sits next to this test module, without
    converting it to another nbformat version.
    """
    notebook_path = os.path.join(directory, 'test_output_types.ipynb')
    return nbformat.read(notebook_path, nbformat.NO_CONVERT)
|
|
||
def test_drop_errors(orig_nb):
    """
    Test that drop_output_types={'error'} removes error outputs while the
    other outputs of a kept cell are preserved
    """
    nb_stripped = strip_output(deepcopy(orig_nb),
                               keep_output=True,
                               keep_count=False,
                               keep_id=False,
                               drop_output_types={'error'})

    # No outputs in the markdown
    assert not hasattr(nb_stripped.cells[0], 'outputs')

    # Original cell should have 3 outputs, with the last being error
    assert len(orig_nb.cells[1].outputs) == 3
    assert orig_nb.cells[1].outputs[2]['output_type'] == 'error'

    # After stripping, the second cell should only have the stdout and
    # stderr streams left. Check the length before indexing so a regression
    # shows up as an assertion failure rather than an IndexError.
    stripped_outputs = nb_stripped.cells[1].outputs
    assert len(stripped_outputs) == 2
    stripped_output_1, stripped_output_2 = stripped_outputs
    assert stripped_output_1['output_type'] == 'stream'
    assert stripped_output_2['output_type'] == 'stream'
    assert stripped_output_1['name'] == 'stdout'
    assert stripped_output_2['name'] == 'stderr'

    # Third cell should have an execution output
    assert len(nb_stripped.cells[2].outputs) == 1
    assert nb_stripped.cells[2].outputs[0]['output_type'] == 'execute_result'
rgeorgi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
|
|
||
def test_keep_output(orig_nb):
    """
    Test keep_output_types: even with keep_output=True, only outputs of the
    listed types are kept
    """
    nb_stripped = strip_output(deepcopy(orig_nb),
                               keep_output=True,
                               keep_count=False,
                               keep_id=False,
                               keep_output_types={'execute_result'})

    # No outputs in the markdown
    assert not hasattr(nb_stripped.cells[0], 'outputs')

    # Original cell should have 3 outputs, with the last being error
    assert len(orig_nb.cells[1].outputs) == 3
    assert orig_nb.cells[1].outputs[2]['output_type'] == 'error'

    # All outputs should be stripped in the second cell
    assert len(nb_stripped.cells[1].outputs) == 0

    # Third cell should have an execution output
    assert len(nb_stripped.cells[2].outputs) == 1
    assert nb_stripped.cells[2].outputs[0]['output_type'] == 'execute_result'
|
|
||
def test_output_format_tags(orig_nb):
    """
    Test 'output_type:name' specifiers in keep_output_types, e.g.
    'stream:stdout' keeps stdout streams but not stderr streams
    """
    nb_stripped = strip_output(deepcopy(orig_nb),
                               keep_output=False,
                               keep_count=False,
                               keep_id=False,
                               keep_output_types={'stream:stdout', 'execute_result'})

    # No outputs in the markdown
    assert not hasattr(nb_stripped.cells[0], 'outputs')

    # Stripping all but stdout should leave only the print statement
    assert len(orig_nb.cells[1].outputs) == 3
    assert len(nb_stripped.cells[1].outputs) == 1
    assert nb_stripped.cells[1].outputs[0]['output_type'] == 'stream'
    assert nb_stripped.cells[1].outputs[0]['name'] == 'stdout'

    # Third cell should have only the execute_result
    assert len(nb_stripped.cells[2].outputs) == 1
    assert nb_stripped.cells[2].outputs[0]['output_type'] == 'execute_result'
Uh oh!
There was an error while loading. Please reload this page.