Skip to content

Commit df398c4

Browse files
committed
ComicInfo parser
1 parent 24fad0d commit df398c4

File tree

6 files changed

+1027
-10
lines changed

6 files changed

+1027
-10
lines changed

CHANGELOG.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,14 @@
22

33
All notable changes to this project will be documented in this file. Take a look at [the migration guide](docs/Migration%20Guide.md) to upgrade between two major versions.
44

5-
<!-- ## [Unreleased] -->
5+
## [Unreleased]
6+
7+
### Added
8+
9+
#### Streamer
10+
11+
* The `ImageParser` now extracts metadata from `ComicInfo.xml` files in CBZ archives.
12+
613

714
## [3.6.0]
815

Lines changed: 358 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,358 @@
1+
//
2+
// Copyright 2026 Readium Foundation. All rights reserved.
3+
// Use of this source code is governed by the BSD-style license
4+
// available in the top-level LICENSE file of the project.
5+
//
6+
7+
import Foundation
8+
import ReadiumFuzi
9+
import ReadiumShared
10+
11+
/// Parses ComicInfo.xml metadata from CBZ archives.
///
/// ComicInfo.xml is a metadata format originating from the ComicRack
/// application.
/// See: https://anansi-project.github.io/docs/comicinfo/documentation
struct ComicInfoParser {
    /// Parses ComicInfo.xml data and returns the parsed metadata.
    ///
    /// - Parameters:
    ///   - data: Raw content of the ComicInfo.xml file.
    ///   - warnings: Optional logger which receives a `ComicInfoWarning`
    ///     whenever the data is rejected.
    /// - Returns: The parsed `ComicInfo`, or `nil` when the data cannot be
    ///   loaded as an XML document or when its root element is not
    ///   `<ComicInfo>`.
    static func parse(data: Data, warnings: WarningLogger?) -> ComicInfo? {
        // The type is module-qualified, like `ReadiumFuzi.XMLElement` in
        // `ComicInfo`, so that it cannot be confused with Foundation's own
        // `XMLDocument` on platforms which provide one.
        guard let document = try? ReadiumFuzi.XMLDocument(data: data) else {
            warnings?.log(ComicInfoWarning(message: "Failed to parse ComicInfo.xml"))
            return nil
        }

        // Any other root element means this is not a ComicInfo document.
        guard let root = document.root, root.tag == "ComicInfo" else {
            warnings?.log(ComicInfoWarning(message: "ComicInfo.xml root element is not <ComicInfo>"))
            return nil
        }

        return ComicInfo(element: root)
    }
}
32+
33+
/// Warning reported while parsing a ComicInfo.xml file.
struct ComicInfoWarning: Warning {
    /// Human-readable description of the problem.
    let message: String

    /// ComicInfo warnings are always reported with a minor severity.
    var severity: WarningSeverityLevel {
        return .minor
    }

    /// Tag shared by every ComicInfo warning.
    var tag: String {
        return "comicinfo"
    }
}
39+
40+
/// Parsed representation of ComicInfo.xml data.
41+
///
42+
/// Only metadata fields that map to RWPM are exposed as first-class properties.
43+
/// All other fields are available in the `otherMetadata` dictionary.
44+
///
45+
/// See https://anansi-project.github.io/docs/comicinfo/documentation
46+
struct ComicInfo {
47+
/// Title of the book.
48+
var title: String?
49+
50+
/// Title of the series the book is part of.
51+
var series: String?
52+
53+
/// Number of the book in the series.
54+
var number: String?
55+
56+
/// Alternate series name, used for cross-over story arcs.
57+
var alternateSeries: String?
58+
59+
/// Number of the book in the alternate series.
60+
var alternateNumber: String?
61+
62+
/// A description or summary of the book.
63+
var summary: String?
64+
65+
/// Person or organization responsible for publishing, releasing, or
66+
/// issuing a resource.
67+
var publisher: String?
68+
69+
/// An imprint is a group of publications under the umbrella of a larger
70+
/// imprint or publisher.
71+
var imprint: String?
72+
73+
/// Release year of the book.
74+
var year: Int?
75+
76+
/// Release month of the book.
77+
var month: Int?
78+
79+
/// Release day of the book.
80+
var day: Int?
81+
82+
/// Language of the book using IETF BCP 47 language tags.
83+
var languageISO: String?
84+
85+
/// Global Trade Item Number identifying the book (ISBN, EAN, etc.).
86+
var gtin: String?
87+
88+
/// People or organizations responsible for creating the scenario.
89+
var writers: [String] = []
90+
91+
/// People or organizations responsible for drawing the art.
92+
var pencillers: [String] = []
93+
94+
/// People or organizations responsible for inking the pencil art.
95+
var inkers: [String] = []
96+
97+
/// People or organizations responsible for applying color to drawings.
98+
var colorists: [String] = []
99+
100+
/// People or organizations responsible for drawing text and speech bubbles.
101+
var letterers: [String] = []
102+
103+
/// People or organizations responsible for drawing the cover art.
104+
var coverArtists: [String] = []
105+
106+
/// People or organizations responsible for preparing the resource for
107+
/// production.
108+
var editors: [String] = []
109+
110+
/// People or organizations responsible for rendering text from one language
111+
/// into another.
112+
var translators: [String] = []
113+
114+
/// Genres of the book or series (e.g., Science-Fiction, Shonen).
115+
var genres: [String] = []
116+
117+
/// Whether the book is a manga. The value `.yesAndRightToLeft` indicates
118+
/// right-to-left reading direction.
119+
var manga: Manga?
120+
121+
/// Page information parsed from the <Pages> element.
122+
var pages: [PageInfo] = []
123+
124+
/// Looks up the first entry of `pages` whose `type` equals the given type.
///
/// - Parameter type: Page type to search for.
/// - Returns: The first matching page, or `nil` when no page has that type.
func firstPageWithType(_ type: PageType) -> PageInfo? {
    for page in pages where page.type == type {
        return page
    }
    return nil
}
128+
129+
/// All other metadata fields not directly mapped to RWPM.
130+
///
131+
/// Keys are the XML tag names (e.g., "Volume", "Characters", "AgeRating").
132+
/// Values are strings as they appear in the XML.
133+
var otherMetadata: [String: String] = [:]
134+
135+
/// URL prefix for otherMetadata keys when converting to RWPM.
136+
private static let otherMetadataPrefix = "https://anansi-project.github.io/docs/comicinfo/documentation#"
137+
138+
/// Populates the metadata from the children of the given element.
///
/// Children without a tag name, or whose trimmed text is empty, are
/// skipped (except `<Pages>`, which is read from its `<Page>` children).
/// Tags without a dedicated property are stored in `otherMetadata`.
/// Numeric fields (`Day`, `Month`, `Year`) become `nil` when their text is
/// not a valid integer.
init(element: ReadiumFuzi.XMLElement) {
    for node in element.children {
        guard let name = node.tag else { continue }

        // <Pages> carries its data in <Page> children rather than in text,
        // so it must be handled before the empty-text check below.
        guard name != "Pages" else {
            pages = node.children(tag: "Page").compactMap { PageInfo(element: $0) }
            continue
        }

        let text = node.stringValue.trimmingCharacters(in: .whitespacesAndNewlines)
        if text.isEmpty { continue }

        switch name {
        // Single-valued text fields
        case "Title": title = text
        case "Series": series = text
        case "Number": number = text
        case "AlternateSeries": alternateSeries = text
        case "AlternateNumber": alternateNumber = text
        case "Summary": summary = text
        case "Publisher": publisher = text
        case "Imprint": imprint = text
        case "LanguageISO": languageISO = text
        case "GTIN": gtin = text

        // Numeric fields
        case "Year": year = Int(text)
        case "Month": month = Int(text)
        case "Day": day = Int(text)

        // Enumerated fields
        case "Manga": manga = Manga(rawValue: text)

        // Comma-separated lists
        case "Genre": genres = text.splitComma()
        case "Writer": writers = text.splitComma()
        case "Penciller": pencillers = text.splitComma()
        case "Inker": inkers = text.splitComma()
        case "Colorist": colorists = text.splitComma()
        case "Letterer": letterers = text.splitComma()
        case "CoverArtist": coverArtists = text.splitComma()
        case "Editor": editors = text.splitComma()
        case "Translator": translators = text.splitComma()

        // Anything else is kept verbatim under its tag name.
        default: otherMetadata[name] = text
        }
    }
}
184+
185+
/// Converts to RWPM Metadata.
///
/// - The publication date is built from `year`, `month` and `day`; a
///   missing month or day defaults to 1, and no date is produced without a
///   year. The date is resolved at midnight UTC so that the result does not
///   depend on the time zone of the device.
/// - `series` and `alternateSeries` become `belongsToSeries` entries whose
///   position is the matching number when it parses as a `Double`.
/// - Cover artists are exposed as contributors with the `cov` role.
/// - `otherMetadata` keys are lowercased and prefixed with the ComicInfo
///   documentation URL.
func toMetadata() -> Metadata {
    // Build published date from year/month/day
    var published: Date?
    if let year = year {
        // A calendar created from an identifier uses the system time zone,
        // which would shift the resulting instant from one device to
        // another. ComicInfo dates carry no time zone, so pin them to UTC.
        var calendar = Calendar(identifier: .gregorian)
        if let utc = TimeZone(secondsFromGMT: 0) {
            calendar.timeZone = utc
        }

        var components = DateComponents()
        components.year = year
        components.month = month ?? 1
        components.day = day ?? 1
        published = calendar.date(from: components)
    }

    // Parse series
    var belongsToSeries: [Contributor] = []
    if let series = series {
        let position = number.flatMap { Double($0) }
        belongsToSeries.append(Contributor(name: series, position: position))
    }
    if let alternateSeries = alternateSeries {
        let position = alternateNumber.flatMap { Double($0) }
        belongsToSeries.append(Contributor(name: alternateSeries, position: position))
    }

    // Build other metadata with specification URL prefix
    var rwpmOtherMetadata: [String: Any] = [:]
    for (key, value) in otherMetadata {
        rwpmOtherMetadata[Self.otherMetadataPrefix + key.lowercased()] = value
    }

    return Metadata(
        identifier: gtin,
        conformsTo: [.divina],
        title: title,
        published: published,
        languages: languageISO.map { [$0] } ?? [],
        subjects: genres.map { Subject(name: $0) },
        authors: writers.map { Contributor(name: $0) },
        translators: translators.map { Contributor(name: $0) },
        editors: editors.map { Contributor(name: $0) },
        letterers: letterers.map { Contributor(name: $0) },
        pencilers: pencillers.map { Contributor(name: $0) },
        colorists: colorists.map { Contributor(name: $0) },
        inkers: inkers.map { Contributor(name: $0) },
        contributors: coverArtists.map { Contributor(name: $0, role: "cov") },
        publishers: publisher.map { [Contributor(name: $0)] } ?? [],
        imprints: imprint.map { [Contributor(name: $0)] } ?? [],
        // Only the explicit right-to-left manga value forces a direction.
        readingProgression: (manga == .yesAndRightToLeft) ? .rtl : .auto,
        description: summary,
        belongsToSeries: belongsToSeries,
        otherMetadata: rwpmOtherMetadata
    )
}
237+
238+
// MARK: - ComicInfo Types
239+
240+
/// Page type values from the ComicInfo specification.
///
/// See: https://anansi-project.github.io/docs/comicinfo/documentation#type
enum PageType: Hashable, Sendable {
    case frontCover
    case innerCover
    case roundup
    case story
    case advertisement
    case editorial
    case letters
    case preview
    case backCover
    case other
    case deleted

    /// Lowercased spellings accepted by `init(rawValue:)`. Both "deleted"
    /// and "delete" map to `.deleted`.
    private static let casesByName: [String: PageType] = [
        "frontcover": .frontCover,
        "innercover": .innerCover,
        "roundup": .roundup,
        "story": .story,
        "advertisement": .advertisement,
        "editorial": .editorial,
        "letters": .letters,
        "preview": .preview,
        "backcover": .backCover,
        "other": .other,
        "deleted": .deleted,
        "delete": .deleted,
    ]

    /// Case-insensitive initializer.
    ///
    /// Fails when `rawValue` is not one of the known page type names.
    init?(rawValue: String) {
        guard let match = Self.casesByName[rawValue.lowercased()] else {
            return nil
        }
        self = match
    }
}
274+
275+
/// Information about a single page from ComicInfo.xml.
///
/// See: https://anansi-project.github.io/docs/comicinfo/documentation#pages--comicpageinfo
struct PageInfo: Hashable, Sendable {
    /// Zero-based index of this page in the reading order.
    let image: Int

    /// The type/purpose of this page.
    let type: PageType?

    /// Whether this is a double-page spread.
    let doublePage: Bool?

    /// File size in bytes.
    let imageSize: Int64?

    /// Page key/identifier.
    let key: String?

    /// Bookmark name for this page.
    let bookmark: String?

    /// Width of the page image in pixels.
    let imageWidth: Int?

    /// Height of the page image in pixels.
    let imageHeight: Int?

    /// Parses a PageInfo from an XML <Page> element.
    ///
    /// Fails when the `Image` attribute is missing or is not an integer.
    /// Every other attribute is optional and becomes `nil` when it is
    /// absent or cannot be converted to the expected type.
    init?(element: ReadiumFuzi.XMLElement) {
        guard let index = element.attr("Image").flatMap({ Int($0) }) else {
            return nil
        }
        image = index

        if let typeName = element.attr("Type") {
            type = PageType(rawValue: typeName)
        } else {
            type = nil
        }

        // Accepts "true"/"false" in any letter case as well as "1"/"0".
        switch element.attr("DoublePage")?.lowercased() {
        case "true"?, "1"?:
            doublePage = true
        case "false"?, "0"?:
            doublePage = false
        default:
            doublePage = nil
        }

        imageSize = element.attr("ImageSize").flatMap { Int64($0) }
        key = element.attr("Key")
        bookmark = element.attr("Bookmark")
        imageWidth = element.attr("ImageWidth").flatMap { Int($0) }
        imageHeight = element.attr("ImageHeight").flatMap { Int($0) }
    }
}
328+
329+
/// Manga field values indicating whether the book is a manga and its
/// reading direction.
///
/// See: https://anansi-project.github.io/docs/comicinfo/documentation#manga
enum Manga {
    case unknown
    case no
    case yes
    case yesAndRightToLeft

    /// Case-insensitive initializer.
    ///
    /// Fails when `rawValue` is not one of "Unknown", "No", "Yes" or
    /// "YesAndRightToLeft" (compared without regard to letter case).
    init?(rawValue: String) {
        let valuesByName: [String: Manga] = [
            "unknown": .unknown,
            "no": .no,
            "yes": .yes,
            "yesandrighttoleft": .yesAndRightToLeft,
        ]
        guard let match = valuesByName[rawValue.lowercased()] else {
            return nil
        }
        self = match
    }
}
350+
}
351+
352+
private extension String {
    /// Splits a comma-separated list into its items.
    ///
    /// Each item is trimmed of surrounding whitespace, including line
    /// breaks so that lists wrapped over several lines in the XML yield
    /// clean values. Items which are empty after trimming are dropped.
    ///
    /// - Returns: The trimmed, non-empty items in their original order.
    func splitComma() -> [String] {
        split(separator: ",").compactMap { item -> String? in
            let trimmed = item.trimmingCharacters(in: .whitespacesAndNewlines)
            return trimmed.isEmpty ? nil : trimmed
        }
    }
}

0 commit comments

Comments
 (0)