From 2c35db7a0759ae9c4c53297565e9b6bea631d31a Mon Sep 17 00:00:00 2001 From: TransZAllen Date: Tue, 26 Aug 2025 18:19:10 +0800 Subject: [PATCH 1/7] [Bug] Fix missing subtitle text in manually downloaded *.SRT files. (issue #10030) - Previously, *.SRT files only contained timestamps and sequence numbers, without the actual text content. - Added recursive text extraction to handle nested tags in TTML files.(e.g.: tags) --- .../newpipe/streams/SrtFromTtmlWriter.java | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java index 7aff655a030..902d5f57c01 100644 --- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -54,6 +54,30 @@ private void writeString(final String text) throws IOException { out.write(text.getBytes(charset)); } + /* + * Recursive method to extract text from all nodes + * - This method processes TextNode and
<br> tags, recursively + * extracting text from nested tags. + * For example: extract text from nested <span> tags + * - Appends newlines for <br>
tags. + */ + private void extractText(final Node node, final StringBuilder text) { + if (node instanceof TextNode) { + text.append(((TextNode) node).text()); + } else if (node instanceof Element) { + final Element element = (Element) node; + //
<br> is a self-closing HTML tag used to insert a line break. + if (element.tagName().equalsIgnoreCase("br")) { + // Add a newline for <br>
tags + text.append(NEW_LINE); + } + } + // Recursively process child nodes + for (final Node child : node.childNodes()) { + extractText(child, text); + } + } + public void build(final SharpStream ttml) throws IOException { /* * TTML parser with BASIC support @@ -81,14 +105,8 @@ public void build(final SharpStream ttml) throws IOException { for (final Element paragraph : paragraphList) { text.setLength(0); - for (final Node children : paragraph.childNodes()) { - if (children instanceof TextNode) { - text.append(((TextNode) children).text()); - } else if (children instanceof Element - && ((Element) children).tagName().equalsIgnoreCase("br")) { - text.append(NEW_LINE); - } - } + // Recursively extract text from all child nodes + extractText(paragraph, text); if (ignoreEmptyFrames && text.length() < 1) { continue; From e1888ede8744829cf47716d987f2fc7d12a75de3 Mon Sep 17 00:00:00 2001 From: TobiGr Date: Wed, 27 Aug 2025 10:34:21 +0200 Subject: [PATCH 2/7] Fix JDoc and apply suggestions --- .../newpipe/streams/SrtFromTtmlWriter.java | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java index 902d5f57c01..5f43185c695 100644 --- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -54,18 +54,23 @@ private void writeString(final String text) throws IOException { out.write(text.getBytes(charset)); } - /* - * Recursive method to extract text from all nodes - * - This method processes TextNode and
<br> tags, recursively - * extracting text from nested tags. - * For example: extract text from nested <span> tags - * - Appends newlines for <br>
tags. + // CHECKSTYLE:OFF checkstyle:JavadocStyle + // checkstyle does not understand that span tags are inside a code block + /** + *

Recursive method to extract text from all nodes.

+ *

+ * This method processes {@link TextNode}s and {@code
<br>} tags, + * recursively extracting text from nested tags + * (e.g. extracting text from nested {@code <span>} tags). + * Newlines are added for {@code <br>
} tags. + *

+ * @param node the current node to process + * @param text the {@link StringBuilder} to append the extracted text to */ private void extractText(final Node node, final StringBuilder text) { - if (node instanceof TextNode) { - text.append(((TextNode) node).text()); - } else if (node instanceof Element) { - final Element element = (Element) node; + if (node instanceof TextNode textNode) { + text.append((textNode).text()); + } else if (node instanceof Element element) { //
<br> is a self-closing HTML tag used to insert a line break. if (element.tagName().equalsIgnoreCase("br")) { // Add a newline for <br>
tags @@ -77,6 +82,7 @@ private void extractText(final Node node, final StringBuilder text) { extractText(child, text); } } + // CHECKSTYLE:ON public void build(final SharpStream ttml) throws IOException { /* @@ -98,7 +104,7 @@ public void build(final SharpStream ttml) throws IOException { final Elements paragraphList = doc.select("body > div > p"); // check if has frames - if (paragraphList.size() < 1) { + if (paragraphList.isEmpty()) { return; } From 22ee01bcfbb8655a65fb6ee6f8373cec54987a4f Mon Sep 17 00:00:00 2001 From: TransZAllen Date: Thu, 18 Sep 2025 20:43:14 +0800 Subject: [PATCH 3/7] refactor(ttml): improve extractText() to preserve spaces and special characters - Replaced `text()` with `getWholeText()`: - avoids losing whitespaces at the beginning, end, or within the text; - avoids merging two or more consecutive spaces into a single space ' '; - avoids converting '\r', '\n', and '\r\n' within the text into a single space ' '; For subtitle conversion, the goal is to preserve every character exactly as intended by the subtitle author. - Normalized tabs, line breaks, and other special characters for SRT-safe output. - Added comprehensive unit tests in `SrtFromTtmlWriterTest.java`, including cases for simple and nested tags. 
--- .../newpipe/streams/SrtFromTtmlWriter.java | 169 ++++++++- .../streams/SrtFromTtmlWriterTest.java | 320 ++++++++++++++++++ 2 files changed, 488 insertions(+), 1 deletion(-) create mode 100644 app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java index 5f43185c695..ad1e4d13a2f 100644 --- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -54,6 +54,157 @@ private void writeString(final String text) throws IOException { out.write(text.getBytes(charset)); } + /** + * Decode XML or HTML entities into their actual (literal) characters. + * + * TTML is XML-based, so text nodes may contain escaped entities + * instead of direct characters. For example: + * + * "&" → "&" + * "<" → "<" + * ">" → ">" + * " " → "\t" (TAB) + * " " ( ) → "\n" (LINE FEED) + * + * XML files cannot contain characters like "<", ">", "&" directly, + * so they must be represented using their entity-encoded forms. + * + * Jsoup sometimes leaves nested or encoded entities unresolved + * (e.g. inside

text nodes in TTML files), so this function + * acts as a final “safety net” to ensure all entities are decoded + * before further normalization. + * + * Character representation layers for reference: + * - Literal characters: <, >, & + * → appear in runtime/output text (e.g. final SRT output) + * - Escaped entities: <, >, & + * → appear in XML/HTML/TTML source files + * - Numeric entities:  , , + * → appear mainly in XML/TTML files (also valid in HTML) + * for non-printable or special characters + * - Unicode escapes: \u00A0 (Java/Unicode internal form) + * → appear only in Java source code (NOT valid in XML) + * + * XML entities include both named (&, <) and numeric + * ( ,  ) forms. + * + * @param encodedEntities The raw text fragment possibly containing + * encoded XML entities. + * @return A decoded string where all entities are replaced by their + * actual (literal) characters. + */ + private String decodeXmlEntities(final String encodedEntities) { + final String decoded = Parser.unescapeEntities(encodedEntities, true); + return decoded; + } + + /** + * Handle rare XML entity characters like LF: (`\n`) + * , CR: (`\r`) and CRLF: (`\r\n`). + * + * These are technically valid in TTML (XML allows them) + * but unusual in practice, since most TTML line breaks + * are represented as
tags instead. + * As a defensive approach, we normalize them: + * + * - Windows (\r\n), macOS (\r), and Unix (\n) → unified SRT NEW_LINE (\r\n) + * + * Although well-formed TTML normally encodes line breaks + * as
tags, some auto-generated or malformed TTML files + * may embed literal newline entities ( , ). This + * normalization ensures these cases render properly in SRT + * players instead of breaking the subtitle structure. + * + * @param text To be normalized text with actual characters. + * @return Unified SRT NEW_LINE converted from all kinds of line breaks. + */ + private String normalizeLineBreakForSrt(final String text) { + String cleaned = text; + + // NOTE: + // The order of newline replacements must NOT change, + // or duplicated line breaks (e.g. \r\n → \n\n) will occur. + cleaned = cleaned.replace("\r\n", "\n") + .replace("\r", "\n"); + + cleaned = cleaned.replace("\n", NEW_LINE); + + return cleaned; + } + + private String normalizeForSrt(final String actualText) { + String cleaned = actualText; + + // Replace non-breaking space (\u00A0) with regular space ' '(\u0020). + // - YouTube TTML subtitles use both regular spaces (\u0020) + // and non-breaking spaces (\u00A0). + // - SRT subtitles only support regular spaces (\u0020), + // so \u00A0 may cause display issues. + // - \u00A0 and \u0020 are visually identical (i.e., they both + // appear as spaces ' '), but they differ in Unicode encoding, + // leading to test failures (e.g., ComparisonFailure). + // - Convert \u00A0 to \u0020 to ensure consistency in subtitle + // formatting. + // - References: + // - Unicode General Punctuation: https://unicode.org/charts/PDF/U2000.pdf + // - TTML Spec: https://www.w3.org/TR/ttml2/ + // - SRT Format: https://en.wikipedia.org/wiki/SubRip + cleaned = cleaned.replace('\u00A0', ' ') // Non-breaking space + .replace('\u202F', ' ') // Narrow no-break space + .replace('\u205F', ' ') // Medium mathematical space + .replace('\u3000', ' ') // Ideographic space + // \u2000 ~ \u200A are whitespace characters (e.g., + // en space, em space), replaced with regular space (\u0020). 
+ .replaceAll("[\\u2000-\\u200A]", " "); // Whitespace characters + + // \u200B ~ \u200F are a range of non-spacing characters + // (e.g., zero-width space, zero-width non-joiner, etc.), + // which have no effect in *.SRT files and may cause + // display issues. + // These characters are invisible to the human eye, and + // they still exist in the encoding, so they need to be + // removed. + // After removal, the actual content becomes completely + // empty "", meaning there are no characters left, just + // an empty space, which helps avoid formatting issues + // in subtitles. + cleaned = cleaned.replaceAll("[\\u200B-\\u200F]", ""); // Non-spacing characters + + // Remove control characters (\u0000 ~ \u001F, except + // \n, \r, \t). + // - These are ASCII C0 control codes (e.g. \u0001 SOH, + // \u0008 BS, \u001F US), invisible and irrelevant in + // subtitles, may cause square boxes (?) in players. + // - Reference: + // Unicode Basic Latin (https://unicode.org/charts/PDF/U0000.pdf) + // ASCII Control (https://en.wikipedia.org/wiki/ASCII#Control_characters) + cleaned = cleaned.replaceAll("[\\u0000-\\u0008\\u000B\\u000C\\u000E-\\u001F]", ""); + + // Reasoning: + // - subtitle files generally don't require tabs for alignment. + // - Tabs can be displayed with varying widths across different + // editors or platforms, which may cause display issues. + // - Replace it with a single space for consistent display + // across different editors or platforms. 
+ cleaned = cleaned.replace('\t', ' '); + + cleaned = normalizeLineBreakForSrt(cleaned); + + return cleaned; + } + + private String sanitizeFragment(final String raw) { + if (null == raw) { + return ""; + } + + final String actualCharacters = decodeXmlEntities(raw); + + final String srtSafeText = normalizeForSrt(actualCharacters); + + return srtSafeText; + } + // CHECKSTYLE:OFF checkstyle:JavadocStyle // checkstyle does not understand that span tags are inside a code block /** @@ -67,9 +218,25 @@ private void writeString(final String text) throws IOException { * @param node the current node to process * @param text the {@link StringBuilder} to append the extracted text to */ + // -------------------------------------------------------------------- + // [INTERNAL NOTE] TTML text layer explanation + // + // TTML parsing involves multiple text "layers": + // 1. Raw XML entities (e.g., <,  ) are decoded by Jsoup. + // 2. extractText() works on DOM TextNodes (already parsed strings). + // 3. sanitizeFragment() decodes remaining entities and fixes + // Unicode quirks. + // 4. normalizeForSrt() ensures literal text is safe for SRT output. + // + // In short: + // Jsoup handles XML-level syntax, + // our code handles text-level normalization for subtitles. + // -------------------------------------------------------------------- private void extractText(final Node node, final StringBuilder text) { if (node instanceof TextNode textNode) { - text.append((textNode).text()); + String rawTtmlFragment = textNode.getWholeText(); + String srtContent = sanitizeFragment(rawTtmlFragment); + text.append(srtContent); } else if (node instanceof Element element) { //
is a self-closing HTML tag used to insert a line break. if (element.tagName().equalsIgnoreCase("br")) { diff --git a/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java b/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java new file mode 100644 index 00000000000..755724f68a6 --- /dev/null +++ b/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java @@ -0,0 +1,320 @@ +package org.schabi.newpipe.streams; + +import org.junit.Test; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.parser.Parser; +import java.io.ByteArrayInputStream; +import java.lang.reflect.Method; +import java.nio.charset.StandardCharsets; +import static org.junit.Assert.assertEquals; + +/** + * Unit tests for {@link SrtFromTtmlWriter}. + * + * Tests focus on {@code extractText()} and its handling of TTML

elements. + * Note: + * - Uses reflection to call the private {@code extractText()} method. + * - Update {@code EXTRACT_TEXT_METHOD} if renamed. + * + * --- + * NOTE ABOUT ENTITIES VS UNICODE ESCAPES + * + * - In short: + * * UNICODE ESCAPES → used in Java source (e.g. SrtFromTtmlWriter.java) + * * ENTITIES → used in TTML strings (this test file) + * + * - TTML is an XML-based format. Real TTML subtitles often encode special + * characters as XML entities (named or numeric), e.g.: + * & → '&' (\u0026) + * < → '<' (\u003C) + * → tab (\u0009) + * → line feed (\u000A) + * → carriage return (\u000D) + * + * - Java source code uses **Unicode escapes** (e.g. "\u00A0") which are resolved + * at compile time, so they do not represent real XML entities. + * + * - Purpose of these tests: + * We simulate *real TTML input* as NewPipe receives it — i.e., strings that + * still contain encoded XML entities ( , , , etc.). + * The production code (`decodeXmlEntities()`) must convert these into their + * actual Unicode characters before normalization. + */ +public class SrtFromTtmlWriterTest { + private static final String TTML_WRAPPER_START = "

"; + private static final String TTML_WRAPPER_END = "
"; + private static final String EXTRACT_TEXT_METHOD = "extractText"; + // Please keep the same definition from `SrtFromTtmlWriter` class. + private static final String NEW_LINE = "\r\n"; + + /* + * TTML example for simple paragraph

without nested tags. + *

Hello World!

+ */ + private static final String SIMPLE_TTML = "

Hello World!

"; + /** + * TTML example with nested tags with
. + *

Hello
World!

+ */ + private static final String NESTED_TTML = "

" + + "Hello
World!

"; + + /** + * TTML example with HTML entities. + * < → <, > → >, & → &, " → ", ' → ' + * ' → ' + *   → ' ' + */ + private static final String ENTITY_TTML = "

" + + "<tag> & "text"''''" + + "  " + + "

"; + /** + * TTML example with special characters: + * - Spaces appear at the beginning and end of the text. + * - Spaces are also present within the text (not just at the edges). + * - The text includes various HTML entities such as  , + * &, <, >, etc. + *   → non-breaking space (Unicode: '\u00A0', Entity: ' ') + */ + private static final String SPECIAL_TTML = "

" + + " ~~-Hello  &&<<>>World!! " + + "

"; + + /** + * TTML example with characters: tab. + * → \t + * They are separated by '+' for clarity. + */ + private static final String TAB_TTML = "

" + + " + + " + + "

"; + + /** + * TTML example with line endings. + * → \r + */ + private static final String LINE_ENDING_0_TTML = "

" + + " + + " + + "

"; + // → \n + private static final String LINE_ENDING_1_TTML = "

" + + " + + " + + "

"; + private static final String LINE_ENDING_2_TTML = + "

" + + " + + " + + "

"; + + /** + * TTML example with control characters. + * For example: + *  → \u0001 + *  → \u001F + * + * These control characters, if included as raw Unicode(e.g. '\u0001'), + * are either invalid in XML or rendered as '?' when processed. + * To avoid issues, they should be encoded(e.g. '') in TTML file. + * + * - Reference: + * Unicode Basic Latin (https://unicode.org/charts/PDF/U0000.pdf), + * ASCII Control (https://en.wikipedia.org/wiki/ASCII#Control_characters). + * and the defination of these characters can be known. + */ + private static final String CONTROL_CHAR_TTML = "

" + + "++ + ++" + + "

"; + + + + private static final String EMPTY_TTML = "

" + + "" + + "

"; + + /** + * TTML example with Unicode space characters. + * These characters are encoded using character references + * (&#xXXXX;). + * + * Includes: + * ( ) '\u202F' → Narrow no-break space + * ( ) '\u205F' → Medium mathematical space + * ( ) '\u3000' → Ideographic space + * '\u2000' ~ '\u200A' are whitespace characters: + * ( ) '\u2000' → En quad + * ( ) '\u2002' → En space + * ( ) '\u200A' → Hair space + * + * Each character is separated by '+' for clarity. + */ + private static final String UNICODE_SPACE_TTML = "

" + + " + + + + + " + + "

"; + + /** + * TTML example with non-spacing (invisible) characters. + * These are encoded using character references (&#xXXXX;). + * + * Includes: + * (​)'\u200B' → Zero-width space (ZWSP) + * (‎)'\u200E' → Left-to-right mark (LRM) + * (‏)'\u200F' → Right-to-left mark (RLM) + * + * They don't display any characters to the human eye. + * '+' is used between them for clarity in test output. + */ + private static final String NON_SPACING_TTML = "

" + + "​+‎+‏" + + "

"; + + /** + * Parses TTML string into a JSoup Document and selects the first

element. + * + * @param ttmlContent TTML content (e.g.,

...

) + * @return the first

element + * @throws Exception if parsing or reflection fails + */ + private Element parseTtmlParagraph(final String ttmlContent) throws Exception { + final String ttml = TTML_WRAPPER_START + ttmlContent + TTML_WRAPPER_END; + final Document doc = Jsoup.parse( + new ByteArrayInputStream(ttml.getBytes(StandardCharsets.UTF_8)), + "UTF-8", "", Parser.xmlParser()); + return doc.select("body > div > p").first(); + } + + /** + * Invokes private extractText method via reflection. + * + * @param writer SrtFromTtmlWriter instance + * @param paragraph

element to extract text from + * @param text StringBuilder to store extracted text + * @throws Exception if reflection fails + */ + private void invokeExtractText(final SrtFromTtmlWriter writer, final Element paragraph, + final StringBuilder text) throws Exception { + final Method method = writer.getClass() + .getDeclaredMethod(EXTRACT_TEXT_METHOD, Node.class, StringBuilder.class); + method.setAccessible(true); + method.invoke(writer, paragraph, text); + } + + private String extractTextFromTtml(final String ttmlInput) throws Exception { + final Element paragraph = parseTtmlParagraph(ttmlInput); + final StringBuilder text = new StringBuilder(); + final SrtFromTtmlWriter writer = new SrtFromTtmlWriter(null, false); + invokeExtractText(writer, paragraph, text); + + final String actualText = text.toString(); + return actualText; + } + + @Test + public void testExtractTextSimpleParagraph() throws Exception { + final String expected = "Hello World!"; + final String actual = extractTextFromTtml(SIMPLE_TTML); + assertEquals(expected, actual); + } + + @Test + public void testExtractTextNestedTags() throws Exception { + final String expected = "Hello\r\nWorld!"; + final String actual = extractTextFromTtml(NESTED_TTML); + assertEquals(expected, actual); + } + + @Test + public void testExtractTextWithEntity() throws Exception { + final String expected = " & \"text\"'''' "; + final String actual = extractTextFromTtml(ENTITY_TTML); + assertEquals(expected, actual); + } + + @Test + public void testExtractTextWithSpecialCharacters() throws Exception { + final String expected = " ~~-Hello &&<<>>World!! 
"; + final String actual = extractTextFromTtml(SPECIAL_TTML); + assertEquals(expected, actual); + } + + @Test + public void testExtractTextWithTab() throws Exception { + final String expected = " + + "; + final String actual = extractTextFromTtml(TAB_TTML); + assertEquals(expected, actual); + } + + @Test + public void testExtractTextWithLineEnding0() throws Exception { + final String expected = NEW_LINE + NEW_LINE + "+" + + NEW_LINE + NEW_LINE + "+" + + NEW_LINE + NEW_LINE; + final String actual = extractTextFromTtml(LINE_ENDING_0_TTML); + assertEquals(expected, actual); + } + + @Test + public void testExtractTextWithLineEnding1() throws Exception { + final String expected = NEW_LINE + NEW_LINE + "+" + + NEW_LINE + NEW_LINE + "+" + + NEW_LINE + NEW_LINE; + final String actual = extractTextFromTtml(LINE_ENDING_1_TTML); + assertEquals(expected, actual); + } + + @Test + public void testExtractTextWithLineEnding2() throws Exception { + final String expected = NEW_LINE + "+" + + NEW_LINE + "+" + + NEW_LINE; + final String actual = extractTextFromTtml(LINE_ENDING_2_TTML); + assertEquals(expected, actual); + } + + @Test + public void testExtractTextWithControlCharacters() throws Exception { + final String expected = "+++++"; + final String actual = extractTextFromTtml(CONTROL_CHAR_TTML); + assertEquals(expected, actual); + } + + /** + * Test case to ensure that extractText() does not throw an exception + * when there are no text in the TTML paragraph (i.e., the paragraph + * is empty). + * + * Note: + * In the NewPipe, *.srt files will contain empty text lines by default. 
+ */ + @Test + public void testExtractTextWithEmpty() throws Exception { + final String expected = ""; + final String actual = extractTextFromTtml(EMPTY_TTML); + assertEquals(expected, actual); + } + + @Test + public void testExtractTextWithUnicodeSpaces() throws Exception { + final String expected = " + + + + + "; + final String actual = extractTextFromTtml(UNICODE_SPACE_TTML); + assertEquals(expected, actual); + } + + @Test + public void testExtractTextWithNonSpacingCharacters() throws Exception { + final String expected = "++"; + final String actual = extractTextFromTtml(NON_SPACING_TTML); + assertEquals(expected, actual); + } +} From 35166676716c4e84f57977945651d99b0d5f0859 Mon Sep 17 00:00:00 2001 From: TransZAllen Date: Fri, 17 Oct 2025 12:04:02 +0800 Subject: [PATCH 4/7] refactor(ttml): extract recursion into `traverseChildNodesForNestedTags()` - Extracted child-node traversal logic from `extractText()` into a helper method `traverseChildNodesForNestedTags()`. - No functional change. --- .../schabi/newpipe/streams/SrtFromTtmlWriter.java | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java index ad1e4d13a2f..bea3422fc2e 100644 --- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -205,6 +205,15 @@ private String sanitizeFragment(final String raw) { return srtSafeText; } + // Recursively process all child nodes to ensure text inside + // nested tags (e.g., ) is also extracted. 
+ private void traverseChildNodesForNestedTags(final Node parent, + final StringBuilder text) { + for (final Node child : parent.childNodes()) { + extractText(child, text); + } + } + // CHECKSTYLE:OFF checkstyle:JavadocStyle // checkstyle does not understand that span tags are inside a code block /** @@ -244,10 +253,8 @@ private void extractText(final Node node, final StringBuilder text) { text.append(NEW_LINE); } } - // Recursively process child nodes - for (final Node child : node.childNodes()) { - extractText(child, text); - } + + traverseChildNodesForNestedTags(node, text); } // CHECKSTYLE:ON From 71aa6d52d321110a530e44df38fc4a501de02c2c Mon Sep 17 00:00:00 2001 From: TransZAllen Date: Tue, 28 Oct 2025 17:39:04 +0800 Subject: [PATCH 5/7] Update app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java Co-authored-by: Tobi --- .../java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java index bea3422fc2e..aaf7bff696e 100644 --- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -99,8 +99,8 @@ private String decodeXmlEntities(final String encodedEntities) { } /** - * Handle rare XML entity characters like LF: (`\n`) - * , CR: (`\r`) and CRLF: (`\r\n`). + * Handle rare XML entity characters like LF: (`\n`), + * CR: (`\r`) and CRLF: (`\r\n`). 
* * These are technically valid in TTML (XML allows them) * but unusual in practice, since most TTML line breaks From d311faea58a3ee03e95b3473924547c87ccded11 Mon Sep 17 00:00:00 2001 From: TransZAllen Date: Wed, 29 Oct 2025 18:52:57 +0800 Subject: [PATCH 6/7] =?UTF-8?q?improve=20comments=20on=20TTML=20=E2=86=92?= =?UTF-8?q?=20SRT=20conversion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - update class header with proper technical references and remove author tag. - update comments of replacing NBSP('\u00A0'), especially adding examples of rendering incorrectly. --- .../newpipe/streams/SrtFromTtmlWriter.java | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java index aaf7bff696e..6f584d055c5 100644 --- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -15,7 +15,11 @@ import java.nio.charset.StandardCharsets; /** - * @author kapodamy + * Converts TTML subtitles to SRT format. + * + * References: + * - TTML 2.0 (W3C): https://www.w3.org/TR/ttml2/ + * - SRT format: https://en.wikipedia.org/wiki/SubRip */ public class SrtFromTtmlWriter { private static final String NEW_LINE = "\r\n"; @@ -135,20 +139,37 @@ private String normalizeLineBreakForSrt(final String text) { private String normalizeForSrt(final String actualText) { String cleaned = actualText; - // Replace non-breaking space (\u00A0) with regular space ' '(\u0020). + // Replace NBSP "non-breaking space" (\u00A0) with regular space ' '(\u0020). 
+ // + // Why: + // - Some viewers render NBSP(\u00A0) incorrectly: + // * MPlayer 1.5: shown as “??” + // * Linux command `cat -A`: displayed as control-like markers + // (M-BM-) + // * Acode (Android editor): displayed as visible replacement + // glyphs (red dots) + // - Other viewers show it as a normal space (e.g., VS Code 1.104.0, + // vlc 3.0.20, mpv 0.37.0, Totem 43.0) + // → Mixed rendering creates inconsistency and may confuse users. + // + // Details: // - YouTube TTML subtitles use both regular spaces (\u0020) // and non-breaking spaces (\u00A0). // - SRT subtitles only support regular spaces (\u0020), // so \u00A0 may cause display issues. // - \u00A0 and \u0020 are visually identical (i.e., they both // appear as spaces ' '), but they differ in Unicode encoding, - // leading to test failures (e.g., ComparisonFailure). - // - Convert \u00A0 to \u0020 to ensure consistency in subtitle - // formatting. - // - References: - // - Unicode General Punctuation: https://unicode.org/charts/PDF/U2000.pdf - // - TTML Spec: https://www.w3.org/TR/ttml2/ - // - SRT Format: https://en.wikipedia.org/wiki/SubRip + // and NBSP (\u00A0) renders differently in different viewers. + // - SRT is a plain-text format and does not interpret + // "non-breaking" behavior. + // + // Conclusion: + // - Ensure uniform behavior, so replace it to a regular space + // without "non-breaking" behavior. 
+ // + // References: + // - Unicode U+00A0 NBSP (Latin-1 Supplement): + // https://unicode.org/charts/PDF/U0080.pdf cleaned = cleaned.replace('\u00A0', ' ') // Non-breaking space .replace('\u202F', ' ') // Narrow no-break space .replace('\u205F', ' ') // Medium mathematical space From 300afde83d7203187d6249cd203a977f8874b5e4 Mon Sep 17 00:00:00 2001 From: TransZAllen Date: Wed, 29 Oct 2025 22:34:47 +0800 Subject: [PATCH 7/7] Update app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java Co-authored-by: Tobi --- .../java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java index 6f584d055c5..652053e45db 100644 --- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -98,8 +98,7 @@ private void writeString(final String text) throws IOException { * actual (literal) characters. */ private String decodeXmlEntities(final String encodedEntities) { - final String decoded = Parser.unescapeEntities(encodedEntities, true); - return decoded; + return Parser.unescapeEntities(encodedEntities, true); } /**