diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java index 7aff655a030..652053e45db 100644 --- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -15,7 +15,11 @@ import java.nio.charset.StandardCharsets; /** - * @author kapodamy + * Converts TTML subtitles to SRT format. + * + * References: + * - TTML 2.0 (W3C): https://www.w3.org/TR/ttml2/ + * - SRT format: https://en.wikipedia.org/wiki/SubRip */ public class SrtFromTtmlWriter { private static final String NEW_LINE = "\r\n"; @@ -54,6 +58,226 @@ private void writeString(final String text) throws IOException { out.write(text.getBytes(charset)); } + /** + * Decode XML or HTML entities into their actual (literal) characters. + * + * TTML is XML-based, so text nodes may contain escaped entities + * instead of direct characters. For example: + * + * "&" → "&" + * "<" → "<" + * ">" → ">" + * " " → "\t" (TAB) + * " " ( ) → "\n" (LINE FEED) + * + * XML files cannot contain characters like "<", ">", "&" directly, + * so they must be represented using their entity-encoded forms. + * + * Jsoup sometimes leaves nested or encoded entities unresolved + * (e.g. inside
text nodes in TTML files), so this function
+ * acts as a final “safety net” to ensure all entities are decoded
+ * before further normalization.
+ *
+ * Character representation layers for reference:
+ * - Literal characters: <, >, &
+ * → appear in runtime/output text (e.g. final SRT output)
+ * - Escaped entities: &lt;, &gt;, &amp;
+ * → appear in XML/HTML/TTML source files
+ * - Numeric entities: &#160;, &#x9;, &#xA;
+ * → appear mainly in XML/TTML files (also valid in HTML)
+ * for non-printable or special characters
+ * - Unicode escapes: \u00A0 (Java/Unicode internal form)
+ * → appear only in Java source code (NOT valid in XML)
+ *
+ * XML entities include both named (&amp;, &lt;) and numeric
+ * (&#x9;, &#xA;) forms.
+ *
+ * @param encodedEntities The raw text fragment possibly containing
+ * encoded XML entities.
+ * @return A decoded string where all entities are replaced by their
+ * actual (literal) characters.
+ */
+    private String decodeXmlEntities(final String encodedEntities) {
+        // Decoding is delegated entirely to jsoup; the boolean argument is
+        // passed through as-is (jsoup documents it as the "inAttribute" flag).
+        final String decoded = Parser.unescapeEntities(encodedEntities, true);
+        return decoded;
+    }
+
+ /**
+ * Handle rare XML entity characters like LF: &#x0A; (`\n`),
+ * CR: &#x0D; (`\r`) and CRLF: &#x0D;&#x0A; (`\r\n`).
+ *
+ * These are technically valid in TTML (XML allows them)
+ * but unusual in practice, since most TTML line breaks
+ * are represented as Recursive method to extract text from all nodes.
+ * This method processes {@link TextNode}s and {@code elements.
+ * Note:
+ * - Uses reflection to call the private {@code extractText()} method.
+ * - Update {@code EXTRACT_TEXT_METHOD} if renamed.
+ *
+ * ---
+ * NOTE ABOUT ENTITIES VS UNICODE ESCAPES
+ *
+ * - In short:
+ * * UNICODE ESCAPES → used in Java source (e.g. SrtFromTtmlWriter.java)
+ * * ENTITIES → used in TTML strings (this test file)
+ *
+ * - TTML is an XML-based format. Real TTML subtitles often encode special
+ * characters as XML entities (named or numeric), e.g.:
+ * &amp; → '&' (\u0026)
+ * &lt; → '<' (\u003C)
+ * &#x9; → tab (\u0009)
+ * &#xA; → line feed (\u000A)
+ * &#xD; → carriage return (\u000D)
+ *
+ * - Java source code uses **Unicode escapes** (e.g. "\u00A0") which are resolved
+ * at compile time, so they do not represent real XML entities.
+ *
+ * - Purpose of these tests:
+ * We simulate *real TTML input* as NewPipe receives it — i.e., strings that
+ * still contain encoded XML entities ( ,
,
, etc.).
+ * The production code (`decodeXmlEntities()`) must convert these into their
+ * actual Unicode characters before normalization.
+ */
+public class SrtFromTtmlWriterTest {
+ private static final String TTML_WRAPPER_START = " without nested tags.
+ * Hello World! Hello World! Hello "
+ + "Hello "
+ + "<tag> & "text"''''"
+ + " "
+ + " "
+ + " ~~-Hello &&<<>>World!! "
+ + " "
+ + " + + "
+ + " "
+ + "
+
+
"
+ + " "
+ + "
+
+
"
+ + " "
+ + "
+
+
"
+ + " "
+ + "+++++"
+ + " "
+ + ""
+ + " "
+ + " + + + + + "
+ + " "
+ + "++"
+ + " element.
+ *
+ * @param ttmlContent TTML content (e.g., ... element
+ * @throws Exception if parsing or reflection fails
+ */
+    private Element parseTtmlParagraph(final String ttmlContent) throws Exception {
+        // Surround the fragment with the wrapper constants and encode it as UTF-8.
+        final byte[] wrappedBytes = (TTML_WRAPPER_START + ttmlContent + TTML_WRAPPER_END)
+                .getBytes(StandardCharsets.UTF_8);
+        final ByteArrayInputStream input = new ByteArrayInputStream(wrappedBytes);
+
+        // Parse with jsoup's XML parser, declaring UTF-8 and an empty base URI.
+        final Document parsed = Jsoup.parse(input, "UTF-8", "", Parser.xmlParser());
+
+        // Only the first paragraph matched by the selector is handed back.
+        return parsed.select("body > div > p").first();
+    }
+
+ /**
+ * Invokes private extractText method via reflection.
+ *
+ * @param writer SrtFromTtmlWriter instance
+ * @param paragraph element to extract text from
+ * @param text StringBuilder to store extracted text
+ * @throws Exception if reflection fails
+ */
+    private void invokeExtractText(final SrtFromTtmlWriter writer, final Element paragraph,
+                                   final StringBuilder text) throws Exception {
+        // Resolve the method by name on the writer's runtime class; the name
+        // comes from EXTRACT_TEXT_METHOD and the signature is (Node, StringBuilder).
+        final Class<?> writerClass = writer.getClass();
+        final Method extractor = writerClass.getDeclaredMethod(
+                EXTRACT_TEXT_METHOD, Node.class, StringBuilder.class);
+
+        // Lift the access check before invoking it on the given writer.
+        extractor.setAccessible(true);
+        extractor.invoke(writer, paragraph, text);
+    }
+
+    private String extractTextFromTtml(final String ttmlInput) throws Exception {
+        // Parse first, then build the writer and the output buffer.
+        final Element parsedParagraph = parseTtmlParagraph(ttmlInput);
+        final StringBuilder collected = new StringBuilder();
+        final SrtFromTtmlWriter srtWriter = new SrtFromTtmlWriter(null, false);
+
+        // extractText() appends into the buffer; its content is the result.
+        invokeExtractText(srtWriter, parsedParagraph, collected);
+        return collected.toString();
+    }
+
+    @Test
+    public void testExtractTextSimpleParagraph() throws Exception {
+        // Expected text for the SIMPLE_TTML fixture.
+        assertEquals("Hello World!", extractTextFromTtml(SIMPLE_TTML));
+    }
+
+    @Test
+    public void testExtractTextNestedTags() throws Exception {
+        // Expected text for the NESTED_TTML fixture: two words separated by CRLF.
+        assertEquals("Hello\r\nWorld!", extractTextFromTtml(NESTED_TTML));
+    }
+
+ @Test
+ public void testExtractTextWithEntity() throws Exception {
+ final String expected = "
tags instead.
+ * As a defensive approach, we normalize them:
+ *
+ * - Windows (\r\n), macOS (\r), and Unix (\n) → unified SRT NEW_LINE (\r\n)
+ *
+ * Although well-formed TTML normally encodes line breaks
+ * as <br/> tags, some auto-generated or malformed TTML files
+ * may embed literal newline entities (&#x0A;, &#x0D;). This
+ * normalization ensures these cases render properly in SRT
+ * players instead of breaking the subtitle structure.
+ *
+ * @param text To be normalized text with actual characters.
+ * @return Unified SRT NEW_LINE converted from all kinds of line breaks.
+ */
+    private String normalizeLineBreakForSrt(final String text) {
+        // Step 1: collapse every line-break flavour into a bare LF.
+        // CRLF has to be folded before lone CR; otherwise "\r\n" would
+        // become two line feeds and produce a duplicated break.
+        final String lineFeedsOnly = text.replace("\r\n", "\n").replace("\r", "\n");
+
+        // Step 2: expand each LF into the SRT line terminator.
+        return lineFeedsOnly.replace("\n", NEW_LINE);
+    }
+
+    private String normalizeForSrt(final String actualText) {
+        // Stage 1: turn "exotic" space characters into a regular space (U+0020).
+        //
+        // Motivation (NBSP, U+00A0, is the common case: YouTube TTML mixes it
+        // with regular spaces):
+        // - Rendering differs between tools. MPlayer 1.5 shows "??",
+        //   `cat -A` prints M-BM-, and Acode shows red replacement dots,
+        //   while VS Code 1.104.0, vlc 3.0.20, mpv 0.37.0 and Totem 43.0
+        //   display an ordinary space.
+        // - SRT is plain text and does not interpret "non-breaking"
+        //   behaviour, so nothing is lost by using a regular space.
+        // - Reference: Unicode Latin-1 Supplement chart,
+        //   https://unicode.org/charts/PDF/U0080.pdf
+        //
+        // The same treatment is applied to the narrow no-break space, the
+        // medium mathematical space, the ideographic space and the
+        // U+2000..U+200A block (en space, em space, ...).
+        final String spacesUnified = actualText
+                .replace('\u00A0', ' ') // Non-breaking space
+                .replace('\u202F', ' ') // Narrow no-break space
+                .replace('\u205F', ' ') // Medium mathematical space
+                .replace('\u3000', ' ') // Ideographic space
+                .replaceAll("[\\u2000-\\u200A]", " "); // Whitespace characters
+
+        // Stage 2: delete U+200B..U+200F (zero-width space, zero-width
+        // non-joiner, etc.). They are invisible and have no effect in *.SRT
+        // files but still occupy the encoding, so they are removed outright
+        // rather than replaced with a space.
+        final String zeroWidthRemoved =
+                spacesUnified.replaceAll("[\\u200B-\\u200F]", ""); // Non-spacing characters
+
+        // Stage 3: delete the ASCII C0 control codes U+0000..U+001F, keeping
+        // only tab, line feed and carriage return for the later stages.
+        // Such codes (SOH, BS, US, ...) are irrelevant in subtitles and may
+        // show up as square boxes in players.
+        // References:
+        //   Unicode Basic Latin (https://unicode.org/charts/PDF/U0000.pdf)
+        //   ASCII Control (https://en.wikipedia.org/wiki/ASCII#Control_characters)
+        final String controlsRemoved = zeroWidthRemoved
+                .replaceAll("[\\u0000-\\u0008\\u000B\\u000C\\u000E-\\u001F]", "");
+
+        // Stage 4: a tab becomes a single space. Subtitles do not need tab
+        // alignment, and tab width varies between editors and platforms.
+        final String tabsFlattened = controlsRemoved.replace('\t', ' ');
+
+        // Stage 5: unify all line breaks to the SRT line terminator.
+        return normalizeLineBreakForSrt(tabsFlattened);
+    }
+
+    private String sanitizeFragment(final String raw) {
+        // A missing fragment contributes nothing to the output.
+        // Otherwise entities are decoded first, and the resulting literal
+        // characters are then normalized for SRT.
+        return raw == null ? "" : normalizeForSrt(decodeXmlEntities(raw));
+    }
+
+    // Hand every direct child of the given node to extractText(), so that
+    // text wrapped in nested tags (e.g., <span>) is collected as well.
+    private void traverseChildNodesForNestedTags(final Node parent,
+                                                 final StringBuilder text) {
+        final int childCount = parent.childNodeSize();
+        for (int index = 0; index < childCount; index++) {
+            extractText(parent.childNode(index), text);
+        }
+    }
+
+ // CHECKSTYLE:OFF checkstyle:JavadocStyle
+ // checkstyle does not understand that span tags are inside a code block
+ /**
+ *
} tags,
+ * recursively extracting text from nested tags
+ * (e.g. extracting text from nested {@code } tags).
+ * Newlines are added for {@code
} tags.
+ *
is a self-closing HTML tag used to insert a line break.
+ if (element.tagName().equalsIgnoreCase("br")) {
+ // Add a newline for
tags
+ text.append(NEW_LINE);
+ }
+ }
+
+ traverseChildNodesForNestedTags(node, text);
+ }
+ // CHECKSTYLE:ON
+
public void build(final SharpStream ttml) throws IOException {
/*
* TTML parser with BASIC support
@@ -74,21 +298,15 @@ public void build(final SharpStream ttml) throws IOException {
final Elements paragraphList = doc.select("body > div > p");
// check if has frames
- if (paragraphList.size() < 1) {
+ if (paragraphList.isEmpty()) {
return;
}
for (final Element paragraph : paragraphList) {
text.setLength(0);
- for (final Node children : paragraph.childNodes()) {
- if (children instanceof TextNode) {
- text.append(((TextNode) children).text());
- } else if (children instanceof Element
- && ((Element) children).tagName().equalsIgnoreCase("br")) {
- text.append(NEW_LINE);
- }
- }
+ // Recursively extract text from all child nodes
+ extractText(paragraph, text);
if (ignoreEmptyFrames && text.length() < 1) {
continue;
diff --git a/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java b/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java
new file mode 100644
index 00000000000..755724f68a6
--- /dev/null
+++ b/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java
@@ -0,0 +1,320 @@
+package org.schabi.newpipe.streams;
+
+import org.junit.Test;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.parser.Parser;
+import java.io.ByteArrayInputStream;
+import java.lang.reflect.Method;
+import java.nio.charset.StandardCharsets;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Unit tests for {@link SrtFromTtmlWriter}.
+ *
+ * Tests focus on {@code extractText()} and its handling of TTML
.
+ *
World!
World!