From 2c35db7a0759ae9c4c53297565e9b6bea631d31a Mon Sep 17 00:00:00 2001
From: TransZAllen <tree.story@outlook.com>
Date: Tue, 26 Aug 2025 18:19:10 +0800
Subject: [PATCH 1/7] [Bug] Fix missing subtitle text in manually downloaded
 *.SRT files. (issue #10030)

- Previously, *.SRT files only contained timestamps and sequence numbers, without the actual text content.
- Added recursive text extraction to handle nested tags in TTML
  files (e.g. <span> tags).
---
 .../newpipe/streams/SrtFromTtmlWriter.java    | 34 ++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)
diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
index 7aff655a030..902d5f57c01 100644
--- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
+++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
@@ -54,6 +54,30 @@ private void writeString(final String text) throws IOException {
         out.write(text.getBytes(charset));
     }
 
+    /*
+     *   Recursive method to extract text from all nodes
+     *   - This method processes TextNode and <br> tags, recursively
+     *     extracting text from nested tags.
+     *     For example: extract text from nested <span> tags
+     *   - Appends newlines for <br> tags.
+     */
+    private void extractText(final Node node, final StringBuilder text) {
+        if (node instanceof TextNode) {
+            text.append(((TextNode) node).text());
+        } else if (node instanceof Element) {
+            final Element element = (Element) node;
+            // <br> is a self-closing HTML tag used to insert a line break.
+            if (element.tagName().equalsIgnoreCase("br")) {
+                // Add a newline for <br> tags
+                text.append(NEW_LINE);
+            }
+        }
+        // Recursively process child nodes
+        for (final Node child : node.childNodes()) {
+            extractText(child, text);
+        }
+    }
+
     public void build(final SharpStream ttml) throws IOException {
         /*
          * TTML parser with BASIC support
@@ -81,14 +105,8 @@ public void build(final SharpStream ttml) throws IOException {
         for (final Element paragraph : paragraphList) {
             text.setLength(0);
 
-            for (final Node children : paragraph.childNodes()) {
-                if (children instanceof TextNode) {
-                    text.append(((TextNode) children).text());
-                } else if (children instanceof Element
-                        && ((Element) children).tagName().equalsIgnoreCase("br")) {
-                    text.append(NEW_LINE);
-                }
-            }
+            // Recursively extract text from all child nodes
+            extractText(paragraph, text);
 
             if (ignoreEmptyFrames && text.length() < 1) {
                 continue;

From e1888ede8744829cf47716d987f2fc7d12a75de3 Mon Sep 17 00:00:00 2001
From: TobiGr <tobigr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 10:34:21 +0200
Subject: [PATCH 2/7] Fix JDoc and apply suggestions

---
 .../newpipe/streams/SrtFromTtmlWriter.java    | 28 +++++++++++--------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
index 902d5f57c01..5f43185c695 100644
--- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
+++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
@@ -54,18 +54,23 @@ private void writeString(final String text) throws IOException {
         out.write(text.getBytes(charset));
     }
 
-    /*
-     *   Recursive method to extract text from all nodes
-     *   - This method processes TextNode and <br> tags, recursively
-     *     extracting text from nested tags.
-     *     For example: extract text from nested <span> tags
-     *   - Appends newlines for <br> tags.
+    // CHECKSTYLE:OFF checkstyle:JavadocStyle
+    // checkstyle does not understand that span tags are inside a code block
+    /**
+     * <p>Recursive method to extract text from all nodes.</p>
+     * <p>
+     *   This method processes {@link TextNode}s and {@code <br>} tags,
+     *   recursively extracting text from nested tags
+     *   (e.g. extracting text from nested {@code <span>} tags).
+     *   Newlines are added for {@code <br>} tags.
+     * </p>
+     * @param node the current node to process
+     * @param text the {@link StringBuilder} to append the extracted text to
      */
     private void extractText(final Node node, final StringBuilder text) {
-        if (node instanceof TextNode) {
-            text.append(((TextNode) node).text());
-        } else if (node instanceof Element) {
-            final Element element = (Element) node;
+        if (node instanceof TextNode textNode) {
+            text.append((textNode).text());
+        } else if (node instanceof Element element) {
             // <br> is a self-closing HTML tag used to insert a line break.
             if (element.tagName().equalsIgnoreCase("br")) {
                 // Add a newline for <br> tags
@@ -77,6 +82,7 @@ private void extractText(final Node node, final StringBuilder text) {
             extractText(child, text);
         }
     }
+    // CHECKSTYLE:ON
 
     public void build(final SharpStream ttml) throws IOException {
         /*
@@ -98,7 +104,7 @@ public void build(final SharpStream ttml) throws IOException {
         final Elements paragraphList = doc.select("body > div > p");
 
         // check if has frames
-        if (paragraphList.size() < 1) {
+        if (paragraphList.isEmpty()) {
             return;
         }
 

From 22ee01bcfbb8655a65fb6ee6f8373cec54987a4f Mon Sep 17 00:00:00 2001
From: TransZAllen <tree.story@outlook.com>
Date: Thu, 18 Sep 2025 20:43:14 +0800
Subject: [PATCH 3/7] refactor(ttml): improve extractText() to preserve spaces
 and special characters

- Replaced `text()` with `getWholeText()`:
  - avoids losing whitespaces at the beginning, end, or within the text;
  - avoids merging two or more consecutive spaces into a single space ' ';
  - avoids converting '\r', '\n', and '\r\n' within the text into a single space ' ';
  For subtitle conversion, the goal is to preserve every character exactly as intended by the subtitle author.
- Normalized tabs, line breaks, and other special characters for SRT-safe output.
- Added comprehensive unit tests in `SrtFromTtmlWriterTest.java`, including cases for simple and nested tags.
---
 .../newpipe/streams/SrtFromTtmlWriter.java    | 169 ++++++++-
 .../streams/SrtFromTtmlWriterTest.java        | 320 ++++++++++++++++++
 2 files changed, 488 insertions(+), 1 deletion(-)
 create mode 100644 app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java

diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
index 5f43185c695..ad1e4d13a2f 100644
--- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
+++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
@@ -54,6 +54,157 @@ private void writeString(final String text) throws IOException {
         out.write(text.getBytes(charset));
     }
 
+    /**
+     * Decode XML or HTML entities into their actual (literal) characters.
+     *
+     * TTML is XML-based, so text nodes may contain escaped entities
+     * instead of direct characters. For example:
+     *
+     *   "&amp;"          → "&"
+     *   "&lt;"           → "<"
+     *   "&gt;"           → ">"
+     *   "&#x9;"          → "\t" (TAB)
+     *   "&#xA;" (&#10;)  → "\n" (LINE FEED)
+     *
+     * XML files cannot contain characters like "<", ">", "&" directly,
+     * so they must be represented using their entity-encoded forms.
+     *
+     * Jsoup sometimes leaves nested or encoded entities unresolved
+     * (e.g. inside <p> text nodes in TTML files), so this function
+     * acts as a final “safety net” to ensure all entities are decoded
+     * before further normalization.
+     *
+     * Character representation layers for reference:
+     *   - Literal characters: <, >, &
+     *       → appear in runtime/output text (e.g. final SRT output)
+     *   - Escaped entities: &lt;, &gt;, &amp;
+     *       → appear in XML/HTML/TTML source files
+     *   - Numeric entities: &#xA0;, &#x9;, &#xD;
+     *       → appear mainly in XML/TTML files (also valid in HTML)
+     *         for non-printable or special characters
+     *   - Unicode escapes: \u00A0 (Java/Unicode internal form)
+     *       → appear only in Java source code (NOT valid in XML)
+     *
+     * XML entities include both named (&amp;, &lt;) and numeric
+     * (&#xA0;, &#160;) forms.
+     *
+     * @param encodedEntities The raw text fragment possibly containing
+     *                        encoded XML entities.
+     * @return A decoded string where all entities are replaced by their
+     *         actual (literal) characters.
+     */
+    private String decodeXmlEntities(final String encodedEntities) {
+        final String decoded = Parser.unescapeEntities(encodedEntities, true);
+        return decoded;
+    }
+
+    /**
+     * Handle rare XML entity characters like LF: &#xA;(`\n`)
+     * , CR: &#xD;(`\r`) and CRLF: (`\r\n`).
+     *
+     * These are technically valid in TTML (XML allows them)
+     * but unusual in practice, since most TTML line breaks
+     * are represented as <br/> tags instead.
+     * As a defensive approach, we normalize them:
+     *
+     * - Windows (\r\n), macOS (\r), and Unix (\n) → unified SRT NEW_LINE (\r\n)
+     *
+     * Although well-formed TTML normally encodes line breaks
+     * as <br/> tags, some auto-generated or malformed TTML files
+     * may embed literal newline entities (&#xA;, &#xD;). This
+     * normalization ensures these cases render properly in SRT
+     * players instead of breaking the subtitle structure.
+     *
+     * @param text The text, made of actual (decoded) characters, to normalize.
+     * @return The text with all line breaks converted to the SRT NEW_LINE.
+     */
+    private String normalizeLineBreakForSrt(final String text) {
+        String cleaned = text;
+
+        // NOTE:
+        // The order of newline replacements must NOT change,
+        // or duplicated line breaks (e.g. \r\n → \n\n) will occur.
+        cleaned = cleaned.replace("\r\n", "\n")
+                         .replace("\r", "\n");
+
+        cleaned = cleaned.replace("\n", NEW_LINE);
+
+        return cleaned;
+    }
+
+    private String normalizeForSrt(final String actualText) {
+        String cleaned = actualText;
+
+        // Replace non-breaking space (\u00A0) with regular space ' '(\u0020).
+        // - YouTube TTML subtitles use both regular spaces (\u0020)
+        //   and non-breaking spaces (\u00A0).
+        // - SRT subtitles only support regular spaces (\u0020),
+        //   so \u00A0 may cause display issues.
+        // - \u00A0 and \u0020 are visually identical (i.e., they both
+        //   appear as spaces ' '), but they differ in Unicode encoding,
+        //   leading to test failures (e.g., ComparisonFailure).
+        // - Convert \u00A0 to \u0020 to ensure consistency in subtitle
+        //   formatting.
+        // - References:
+        //   - Unicode General Punctuation: https://unicode.org/charts/PDF/U2000.pdf
+        //   - TTML Spec: https://www.w3.org/TR/ttml2/
+        //   - SRT Format: https://en.wikipedia.org/wiki/SubRip
+        cleaned = cleaned.replace('\u00A0', ' ') // Non-breaking space
+                 .replace('\u202F', ' ') // Narrow no-break space
+                 .replace('\u205F', ' ') // Medium mathematical space
+                 .replace('\u3000', ' ') // Ideographic space
+                 // \u2000 ~ \u200A are whitespace characters (e.g.,
+                 // en space, em space), replaced with regular space (\u0020).
+                 .replaceAll("[\\u2000-\\u200A]", " "); // Whitespace characters
+
+        // \u200B ~ \u200F are a range of non-spacing characters
+        // (e.g., zero-width space, zero-width non-joiner, etc.),
+        // which have no effect in *.SRT files and may cause
+        // display issues.
+        // These characters are invisible to the human eye, and
+        // they still exist in the encoding, so they need to be
+        // removed.
+        // They are replaced with the empty string "" (i.e. deleted
+        // outright, not substituted with a space), so no trace of
+        // them remains in the output, which helps avoid formatting
+        // issues in subtitles.
+        cleaned = cleaned.replaceAll("[\\u200B-\\u200F]", ""); // Non-spacing characters
+
+        // Remove control characters (\u0000 ~ \u001F, except
+        // \n, \r, \t).
+        // - These are ASCII C0 control codes (e.g. \u0001 SOH,
+        //   \u0008 BS, \u001F US), invisible and irrelevant in
+        //   subtitles, may cause square boxes (?) in players.
+        // - Reference:
+        //   Unicode Basic Latin (https://unicode.org/charts/PDF/U0000.pdf)
+        //   ASCII Control (https://en.wikipedia.org/wiki/ASCII#Control_characters)
+        cleaned = cleaned.replaceAll("[\\u0000-\\u0008\\u000B\\u000C\\u000E-\\u001F]", "");
+
+        // Reasoning:
+        // - subtitle files generally don't require tabs for alignment.
+        // - Tabs can be displayed with varying widths across different
+        //   editors or platforms, which may cause display issues.
+        // - Replace it with a single space for consistent display
+        //   across different editors or platforms.
+        cleaned = cleaned.replace('\t', ' ');
+
+        cleaned = normalizeLineBreakForSrt(cleaned);
+
+        return cleaned;
+    }
+
+    private String sanitizeFragment(final String raw) {
+        if (null == raw) {
+            return "";
+        }
+
+        final String actualCharacters = decodeXmlEntities(raw);
+
+        final String srtSafeText = normalizeForSrt(actualCharacters);
+
+        return srtSafeText;
+    }
+
     // CHECKSTYLE:OFF checkstyle:JavadocStyle
     // checkstyle does not understand that span tags are inside a code block
     /**
@@ -67,9 +218,25 @@ private void writeString(final String text) throws IOException {
      * @param node the current node to process
      * @param text the {@link StringBuilder} to append the extracted text to
      */
+    // --------------------------------------------------------------------
+    // [INTERNAL NOTE] TTML text layer explanation
+    //
+    // TTML parsing involves multiple text "layers":
+    //   1. Raw XML entities (e.g., &lt;, &#xA0;) are decoded by Jsoup.
+    //   2. extractText() works on DOM TextNodes (already parsed strings).
+    //   3. sanitizeFragment() decodes remaining entities and fixes
+    //      Unicode quirks.
+    //   4. normalizeForSrt() ensures literal text is safe for SRT output.
+    //
+    // In short:
+    //   Jsoup handles XML-level syntax,
+    //   our code handles text-level normalization for subtitles.
+    // --------------------------------------------------------------------
     private void extractText(final Node node, final StringBuilder text) {
         if (node instanceof TextNode textNode) {
-            text.append((textNode).text());
+            String rawTtmlFragment = textNode.getWholeText();
+            String srtContent = sanitizeFragment(rawTtmlFragment);
+            text.append(srtContent);
         } else if (node instanceof Element element) {
             // <br> is a self-closing HTML tag used to insert a line break.
             if (element.tagName().equalsIgnoreCase("br")) {
diff --git a/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java b/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java
new file mode 100644
index 00000000000..755724f68a6
--- /dev/null
+++ b/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java
@@ -0,0 +1,320 @@
+package org.schabi.newpipe.streams;
+
+import org.junit.Test;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.parser.Parser;
+import java.io.ByteArrayInputStream;
+import java.lang.reflect.Method;
+import java.nio.charset.StandardCharsets;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Unit tests for {@link SrtFromTtmlWriter}.
+ *
+ * Tests focus on {@code extractText()} and its handling of TTML <p> elements.
+ * Note:
+ * - Uses reflection to call the private {@code extractText()} method.
+ * - Update {@code EXTRACT_TEXT_METHOD} if renamed.
+ *
+ * ---
+ * NOTE ABOUT ENTITIES VS UNICODE ESCAPES
+ *
+ * - In short:
+ *   * UNICODE ESCAPES → used in Java source (e.g. SrtFromTtmlWriter.java)
+ *   * ENTITIES → used in TTML strings (this test file)
+ *
+ * - TTML is an XML-based format. Real TTML subtitles often encode special
+ *   characters as XML entities (named or numeric), e.g.:
+ *       &amp;    → '&' (\u0026)
+ *       &lt;     → '<' (\u003C)
+ *       &#x9;    → tab (\u0009)
+ *       &#xA;    → line feed (\u000A)
+ *       &#xD;    → carriage return (\u000D)
+ *
+ * - Java source code uses **Unicode escapes** (e.g. "\u00A0") which are resolved
+ *   at compile time, so they do not represent real XML entities.
+ *
+ * - Purpose of these tests:
+ *   We simulate *real TTML input* as NewPipe receives it — i.e., strings that
+ *   still contain encoded XML entities (&#x9;, &#xA;, &#xD;, etc.).
+ *   The production code (`decodeXmlEntities()`) must convert these into their
+ *   actual Unicode characters before normalization.
+ */
+public class SrtFromTtmlWriterTest {
+    private static final String TTML_WRAPPER_START = "<tt><body><div>";
+    private static final String TTML_WRAPPER_END = "</div></body></tt>";
+    private static final String EXTRACT_TEXT_METHOD = "extractText";
+    // Keep this in sync with NEW_LINE in the `SrtFromTtmlWriter` class.
+    private static final String NEW_LINE = "\r\n";
+
+    /*
+     * TTML example for simple paragraph <p> without nested tags.
+     * <p begin="00:00:01.000" end="00:00:03.000" style="s2">Hello World!</p>
+     */
+    private static final String SIMPLE_TTML = "<p begin=\"00:00:01.000\" end=\"00:00:03.000\" "
+            + "style=\"s2\">Hello World!</p>";
+    /**
+     * TTML example with nested tags with <br>.
+     * <p begin="00:00:01.000" end="00:00:03.000"><span style="s4">Hello</span><br>World!</p>
+     */
+    private static final String NESTED_TTML = "<p begin=\"00:00:01.000\" end=\"00:00:03.000\">"
+            + "<span style=\"s4\">Hello</span><br>World!</p>";
+
+    /**
+     * TTML example with HTML entities.
+     * &lt; → <, &gt; → >, &amp; → &, &quot; → ", &apos; → '
+     * &#39; → '
+     * &#xA0; → ' '
+     */
+    private static final String ENTITY_TTML = "<p begin=\"00:00:05.000\" "
+            + "end=\"00:00:07.000\">"
+            + "&lt;tag&gt; &amp; &quot;text&quot;&apos;&apos;&#39;&#39;"
+            + "&#xA0;&#xA0;"
+            + "</p>";
+    /**
+     * TTML example with special characters:
+     * - Spaces appear at the beginning and end of the text.
+     * - Spaces are also present within the text (not just at the edges).
+     * - The text includes various HTML entities such as &nbsp;,
+     *   &amp;, &lt;, &gt;, etc.
+     * &nbsp; → non-breaking space (Unicode: '\u00A0', Entity: '&#xA0;')
+     */
+    private static final String SPECIAL_TTML = "<p begin=\"00:00:05.000\" end=\"00:00:07.000\">"
+            + "   ～~-Hello&nbsp;&nbsp;&amp;&amp;&lt;&lt;&gt;&gt;World!!   "
+            + "</p>";
+
+    /**
+     * TTML example with characters: tab.
+     * &#x9; → \t
+     * They are separated by '+' for clarity.
+     */
+    private static final String TAB_TTML = "<p begin=\"00:00:05.000\" "
+            + "end=\"00:00:07.000\">"
+            + "&#x9;&#x9;+&#x9;&#x9;+&#x9;&#x9;"
+            + "</p>";
+
+    /**
+     * TTML example with line endings.
+     * &#xD; → \r
+     */
+    private static final String LINE_ENDING_0_TTML = "<p begin=\"00:00:05.000\" "
+            + "end=\"00:00:07.000\">"
+            + "&#xD;&#xD;+&#xD;&#xD;+&#xD;&#xD;"
+            + "</p>";
+    // &#xA; → \n
+    private static final String LINE_ENDING_1_TTML = "<p begin=\"00:00:05.000\" "
+            + "end=\"00:00:07.000\">"
+            + "&#xA;&#xA;+&#xA;&#xA;+&#xA;&#xA;"
+            + "</p>";
+    private static final String LINE_ENDING_2_TTML =
+            "<p begin=\"00:00:05.000\" end=\"00:00:07.000\">"
+            + "&#xD;&#xA;+&#xD;&#xA;+&#xD;&#xA;"
+            + "</p>";
+
+    /**
+     * TTML example with control characters.
+     * For example:
+     * &#x0001; → \u0001
+     * &#x001F; → \u001F
+     *
+     * These control characters, if included as raw Unicode(e.g. '\u0001'),
+     * are either invalid in XML or rendered as '?' when processed.
+     * To avoid issues, they should be encoded (e.g. '&#x0001;') in the TTML file.
+     *
+     * - Reference:
+     *   Unicode Basic Latin (https://unicode.org/charts/PDF/U0000.pdf),
+     *   ASCII Control (https://en.wikipedia.org/wiki/ASCII#Control_characters).
+     *   The definitions of these characters can be found there.
+     */
+    private static final String CONTROL_CHAR_TTML = "<p begin=\"00:00:05.000\" "
+            + "end=\"00:00:07.000\">"
+            + "+&#x0008;+&#x000B;+&#x000C;+&#x000E;+&#x001F;"
+            + "</p>";
+
+
+
+    private static final String EMPTY_TTML = "<p begin=\"00:00:01.000\" "
+            + "end=\"00:00:03.000\">"
+            + ""
+            + "</p>";
+
+    /**
+     * TTML example with Unicode space characters.
+     * These characters are encoded using character references
+     * (&#xXXXX;).
+     *
+     * Includes:
+     * (&#x202F;) '\u202F' → Narrow no-break space
+     * (&#x205F;) '\u205F' → Medium mathematical space
+     * (&#x3000;) '\u3000' → Ideographic space
+     * '\u2000' ~ '\u200A' are whitespace characters:
+     * (&#x2000;) '\u2000' → En quad
+     * (&#x2002;) '\u2002' → En space
+     * (&#x200A;) '\u200A' → Hair space
+     *
+     * Each character is separated by '+' for clarity.
+     */
+    private static final String UNICODE_SPACE_TTML = "<p begin=\"00:00:05.000\" "
+            + "end=\"00:00:07.000\">"
+            + "&#x202F;+&#x205F;+&#x3000;+&#x2000;+&#x2002;+&#x200A;"
+            + "</p>";
+
+    /**
+     * TTML example with non-spacing (invisible) characters.
+     * These are encoded using character references (&#xXXXX;).
+     *
+     * Includes:
+     * (&#x200B;)'\u200B' → Zero-width space (ZWSP)
+     * (&#x200E;)'\u200E' → Left-to-right mark (LRM)
+     * (&#x200F;)'\u200F' → Right-to-left mark (RLM)
+     *
+     * They don't display any characters to the human eye.
+     * '+' is used between them for clarity in test output.
+     */
+    private static final String NON_SPACING_TTML = "<p begin=\"00:00:05.000\" "
+            + "end=\"00:00:07.000\">"
+            + "&#x200B;+&#x200E;+&#x200F;"
+            + "</p>";
+
+    /**
+     * Parses TTML string into a JSoup Document and selects the first <p> element.
+     *
+     * @param ttmlContent TTML content (e.g., <p>...</p>)
+     * @return the first <p> element
+     * @throws Exception if parsing fails
+     */
+    private Element parseTtmlParagraph(final String ttmlContent) throws Exception {
+        final String ttml = TTML_WRAPPER_START + ttmlContent + TTML_WRAPPER_END;
+        final Document doc = Jsoup.parse(
+                new ByteArrayInputStream(ttml.getBytes(StandardCharsets.UTF_8)),
+                "UTF-8", "", Parser.xmlParser());
+        return doc.select("body > div > p").first();
+    }
+
+    /**
+     * Invokes private extractText method via reflection.
+     *
+     * @param writer SrtFromTtmlWriter instance
+     * @param paragraph <p> element to extract text from
+     * @param text StringBuilder to store extracted text
+     * @throws Exception if reflection fails
+     */
+    private void invokeExtractText(final SrtFromTtmlWriter writer, final Element paragraph,
+                                  final StringBuilder text) throws Exception {
+        final Method method = writer.getClass()
+                .getDeclaredMethod(EXTRACT_TEXT_METHOD, Node.class, StringBuilder.class);
+        method.setAccessible(true);
+        method.invoke(writer, paragraph, text);
+    }
+
+    private String extractTextFromTtml(final String ttmlInput) throws Exception {
+        final Element paragraph = parseTtmlParagraph(ttmlInput);
+        final StringBuilder text = new StringBuilder();
+        final SrtFromTtmlWriter writer = new SrtFromTtmlWriter(null, false);
+        invokeExtractText(writer, paragraph, text);
+
+        final String actualText = text.toString();
+        return actualText;
+    }
+
+    @Test
+    public void testExtractTextSimpleParagraph() throws Exception {
+        final String expected = "Hello World!";
+        final String actual = extractTextFromTtml(SIMPLE_TTML);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testExtractTextNestedTags() throws Exception {
+        final String expected = "Hello\r\nWorld!";
+        final String actual = extractTextFromTtml(NESTED_TTML);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testExtractTextWithEntity() throws Exception {
+        final String expected = "<tag> & \"text\"''''  ";
+        final String actual = extractTextFromTtml(ENTITY_TTML);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testExtractTextWithSpecialCharacters() throws Exception {
+        final String expected = "   ～~-Hello  &&<<>>World!!   ";
+        final String actual = extractTextFromTtml(SPECIAL_TTML);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testExtractTextWithTab() throws Exception {
+        final String expected = "  +  +  ";
+        final String actual = extractTextFromTtml(TAB_TTML);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testExtractTextWithLineEnding0() throws Exception {
+        final String expected = NEW_LINE + NEW_LINE + "+"
+                                + NEW_LINE + NEW_LINE + "+"
+                                + NEW_LINE + NEW_LINE;
+        final String actual = extractTextFromTtml(LINE_ENDING_0_TTML);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testExtractTextWithLineEnding1() throws Exception {
+        final String expected = NEW_LINE + NEW_LINE + "+"
+                                + NEW_LINE + NEW_LINE + "+"
+                                + NEW_LINE + NEW_LINE;
+        final String actual = extractTextFromTtml(LINE_ENDING_1_TTML);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testExtractTextWithLineEnding2() throws Exception {
+        final String expected = NEW_LINE + "+"
+                                + NEW_LINE + "+"
+                                + NEW_LINE;
+        final String actual = extractTextFromTtml(LINE_ENDING_2_TTML);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testExtractTextWithControlCharacters() throws Exception {
+        final String expected = "+++++";
+        final String actual = extractTextFromTtml(CONTROL_CHAR_TTML);
+        assertEquals(expected, actual);
+    }
+
+    /**
+     * Test case to ensure that extractText() does not throw an exception
+     * when there is no text in the TTML paragraph (i.e., the paragraph
+     * is empty).
+     *
+     * Note:
+     *   In NewPipe, *.srt files contain empty text lines by default.
+     */
+    @Test
+    public void testExtractTextWithEmpty() throws Exception {
+        final String expected = "";
+        final String actual = extractTextFromTtml(EMPTY_TTML);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testExtractTextWithUnicodeSpaces() throws Exception {
+        final String expected = " + + + + + ";
+        final String actual = extractTextFromTtml(UNICODE_SPACE_TTML);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testExtractTextWithNonSpacingCharacters() throws Exception {
+        final String expected = "++";
+        final String actual = extractTextFromTtml(NON_SPACING_TTML);
+        assertEquals(expected, actual);
+    }
+}

From 35166676716c4e84f57977945651d99b0d5f0859 Mon Sep 17 00:00:00 2001
From: TransZAllen <tree.story@outlook.com>
Date: Fri, 17 Oct 2025 12:04:02 +0800
Subject: [PATCH 4/7] refactor(ttml): extract recursion into
 `traverseChildNodesForNestedTags()`

- Extracted child-node traversal logic from `extractText()`
  into a helper method `traverseChildNodesForNestedTags()`.
- No functional change.
---
 .../schabi/newpipe/streams/SrtFromTtmlWriter.java | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
index ad1e4d13a2f..bea3422fc2e 100644
--- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
+++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
@@ -205,6 +205,15 @@ private String sanitizeFragment(final String raw) {
         return srtSafeText;
     }
 
+    // Recursively process all child nodes to ensure text inside
+    // nested tags (e.g., <span>) is also extracted.
+    private void traverseChildNodesForNestedTags(final Node parent,
+                                                 final StringBuilder text) {
+        for (final Node child : parent.childNodes()) {
+            extractText(child, text);
+        }
+    }
+
     // CHECKSTYLE:OFF checkstyle:JavadocStyle
     // checkstyle does not understand that span tags are inside a code block
     /**
@@ -244,10 +253,8 @@ private void extractText(final Node node, final StringBuilder text) {
                 text.append(NEW_LINE);
             }
         }
-        // Recursively process child nodes
-        for (final Node child : node.childNodes()) {
-            extractText(child, text);
-        }
+
+        traverseChildNodesForNestedTags(node, text);
     }
     // CHECKSTYLE:ON
 

From 71aa6d52d321110a530e44df38fc4a501de02c2c Mon Sep 17 00:00:00 2001
From: TransZAllen <tree.story@outlook.com>
Date: Tue, 28 Oct 2025 17:39:04 +0800
Subject: [PATCH 5/7] Update
 app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java

Co-authored-by: Tobi <TobiGr@users.noreply.github.com>
---
 .../java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
index bea3422fc2e..aaf7bff696e 100644
--- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
+++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
@@ -99,8 +99,8 @@ private String decodeXmlEntities(final String encodedEntities) {
     }
 
     /**
-     * Handle rare XML entity characters like LF: &#xA;(`\n`)
-     * , CR: &#xD;(`\r`) and CRLF: (`\r\n`).
+     * Handle rare XML entity characters like LF: &#xA;(`\n`),
+     * CR: &#xD;(`\r`) and CRLF: (`\r\n`).
      *
      * These are technically valid in TTML (XML allows them)
      * but unusual in practice, since most TTML line breaks

From d311faea58a3ee03e95b3473924547c87ccded11 Mon Sep 17 00:00:00 2001
From: TransZAllen <tree.story@outlook.com>
Date: Wed, 29 Oct 2025 18:52:57 +0800
Subject: [PATCH 6/7] =?UTF-8?q?improve=20comments=20on=20TTML=20=E2=86=92?=
 =?UTF-8?q?=20SRT=20conversion?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- update class header with proper technical references and remove author tag.
- update the comments on replacing NBSP ('\u00A0'), in particular adding
  examples of viewers that render it incorrectly.
---
 .../newpipe/streams/SrtFromTtmlWriter.java    | 39 ++++++++++++++-----
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
index aaf7bff696e..6f584d055c5 100644
--- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
+++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
@@ -15,7 +15,11 @@
 import java.nio.charset.StandardCharsets;
 
 /**
- * @author kapodamy
+ * Converts TTML subtitles to SRT format.
+ *
+ * References:
+ *  - TTML 2.0 (W3C): https://www.w3.org/TR/ttml2/
+ *  - SRT format: https://en.wikipedia.org/wiki/SubRip
  */
 public class SrtFromTtmlWriter {
     private static final String NEW_LINE = "\r\n";
@@ -135,20 +139,37 @@ private String normalizeLineBreakForSrt(final String text) {
     private String normalizeForSrt(final String actualText) {
         String cleaned = actualText;
 
-        // Replace non-breaking space (\u00A0) with regular space ' '(\u0020).
+        // Replace NBSP "non-breaking space" (\u00A0) with regular space ' '(\u0020).
+        //
+        // Why:
+        // - Some viewers render NBSP(\u00A0) incorrectly:
+        //   * MPlayer 1.5: shown as “??”
+        //   * Linux command `cat -A`: displayed as control-like markers
+        //     (M-BM-)
+        //   * Acode (Android editor): displayed as visible replacement
+        //     glyphs (red dots)
+        // - Other viewers show it as a normal space (e.g., VS Code 1.104.0,
+        //   vlc 3.0.20, mpv 0.37.0, Totem 43.0)
+        // → Mixed rendering creates inconsistency and may confuse users.
+        //
+        // Details:
         // - YouTube TTML subtitles use both regular spaces (\u0020)
         //   and non-breaking spaces (\u00A0).
         // - SRT subtitles only support regular spaces (\u0020),
         //   so \u00A0 may cause display issues.
         // - \u00A0 and \u0020 are visually identical (i.e., they both
         //   appear as spaces ' '), but they differ in Unicode encoding,
-        //   leading to test failures (e.g., ComparisonFailure).
-        // - Convert \u00A0 to \u0020 to ensure consistency in subtitle
-        //   formatting.
-        // - References:
-        //   - Unicode General Punctuation: https://unicode.org/charts/PDF/U2000.pdf
-        //   - TTML Spec: https://www.w3.org/TR/ttml2/
-        //   - SRT Format: https://en.wikipedia.org/wiki/SubRip
+        //   and NBSP (\u00A0) renders differently in different viewers.
+        // - SRT is a plain-text format and does not interpret
+        //   "non-breaking" behavior.
+        //
+        // Conclusion:
+        // - Ensure uniform behavior, so replace it with a regular space
+        //   without "non-breaking" behavior.
+        //
+        // References:
+        //   - Unicode U+00A0 NBSP (Latin-1 Supplement):
+        //     https://unicode.org/charts/PDF/U0080.pdf
         cleaned = cleaned.replace('\u00A0', ' ') // Non-breaking space
                  .replace('\u202F', ' ') // Narrow no-break space
                  .replace('\u205F', ' ') // Medium mathematical space

From 300afde83d7203187d6249cd203a977f8874b5e4 Mon Sep 17 00:00:00 2001
From: TransZAllen <tree.story@outlook.com>
Date: Wed, 29 Oct 2025 22:34:47 +0800
Subject: [PATCH 7/7] Update
 app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java

Co-authored-by: Tobi <TobiGr@users.noreply.github.com>
---
 .../java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java     | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
index 6f584d055c5..652053e45db 100644
--- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
+++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
@@ -98,8 +98,7 @@ private void writeString(final String text) throws IOException {
      *         actual (literal) characters.
      */
     private String decodeXmlEntities(final String encodedEntities) {
-        final String decoded = Parser.unescapeEntities(encodedEntities, true);
-        return decoded;
+        return Parser.unescapeEntities(encodedEntities, true);
     }
 
     /**