@@ -757,6 +757,197 @@ public struct KokoroSynthesizer {
757757 )
758758 }
759759
/// Synthesize directly from a Kokoro-zh phoneme string (one codepoint per token).
/// This bypasses English lexicon + chunking and is useful for Mandarin.
///
/// - Parameters:
///   - phonemes: Phoneme string; each Unicode codepoint becomes one model token.
///   - voice: Voice-embedding identifier. Defaults to `TtsConstants.recommendedVoice`.
///   - voiceSpeed: Playback-speed factor; clamped to >= 0.1. Values within 0.01 of 1.0
///     skip time-stretching entirely.
///   - variantPreference: Preferred model variant. Defaults to `.fifteenSecond`.
/// - Returns: A `SynthesisResult` containing WAV audio, per-chunk info, and diagnostics.
/// - Throws: `TTSError.processingFailed` when a voice embedding is missing for a chunk
///   or when synthesis yields no samples; plus any error from resource preparation,
///   model loading, chunk synthesis, or WAV encoding.
public static func synthesizePhonemeStringDetailed(
    phonemes: String,
    voice: String = TtsConstants.recommendedVoice,
    voiceSpeed: Float = 1.0,
    variantPreference: ModelNames.TTS.Variant? = .fifteenSecond
) async throws -> SynthesisResult {
    logger.info("Starting synthesis from phoneme string; length=\(phonemes.count)")

    try await ensureRequiredFiles()
    if !isVoiceEmbeddingPayloadCached(for: voice) {
        // Best-effort download; a still-missing embedding surfaces below as processingFailed.
        try? await TtsResourceDownloader.ensureVoiceEmbedding(voice: voice)
    }
    try await loadModel(variant: variantPreference)

    let modelCache = try currentModelCache()
    let vocabulary = try await KokoroVocabulary.shared.getVocabulary()
    let capacities = try await capacities(for: variantPreference)
    let lexiconMetrics = await lexiconCache.metrics()

    // Build a single chunk from phoneme codepoints (one token per codepoint).
    let tokens: [String] = phonemes.map { String($0) }
    let chunk = TextChunk(
        words: [],
        atoms: tokens,
        phonemes: tokens,
        totalFrames: 0,
        pauseAfterMs: 0,
        text: phonemes
    )
    let entries = try buildChunkEntries(
        from: [chunk],
        vocabulary: vocabulary,
        preference: variantPreference,
        capacities: capacities
    )

    struct ChunkSynthesisResult: Sendable {
        let index: Int
        let samples: [Float]
        let predictionTime: TimeInterval
    }

    let embeddingDimension = try await modelCache.referenceEmbeddingDimension()
    let embeddingCache = try prepareVoiceEmbeddingCache(
        voice: voice,
        entries: entries,
        embeddingDimension: embeddingDimension
    )

    // Pre-allocate pooled multi-arrays for every shape this batch will request,
    // so the concurrent chunk tasks below never contend on fresh allocations.
    let totalChunks = entries.count
    let groupedByTargetTokens = Dictionary(grouping: entries, by: { $0.template.targetTokens })
    let phasesShape: [NSNumber] = [1, 9]
    try await multiArrayPool.preallocate(shape: phasesShape, dataType: .float32, count: max(1, totalChunks), zeroFill: true)
    for (targetTokens, group) in groupedByTargetTokens {
        let shape: [NSNumber] = [1, NSNumber(value: targetTokens)]
        try await multiArrayPool.preallocate(shape: shape, dataType: .int32, count: max(1, group.count * 2), zeroFill: false)
    }
    let refShape: [NSNumber] = [1, NSNumber(value: embeddingDimension)]
    try await multiArrayPool.preallocate(shape: refShape, dataType: .float32, count: max(1, totalChunks), zeroFill: false)

    let chunkTemplates = entries.map { $0.template }
    var chunkSampleBuffers = Array(repeating: [Float](), count: totalChunks)
    var allSamples: [Float] = []
    let crossfadeMs = 8
    let samplesPerMillisecond = Double(TtsConstants.audioSampleRate) / 1_000.0
    let crossfadeN = max(0, Int(Double(crossfadeMs) * samplesPerMillisecond))

    // Synthesize every chunk concurrently; results arrive unordered and are sorted below.
    let chunkOutputs = try await withThrowingTaskGroup(of: ChunkSynthesisResult.self) { group in
        for (index, entry) in entries.enumerated() {
            let chunk = entry.chunk
            let inputIds = entry.inputIds
            let template = entry.template
            let chunkIndex = index
            guard let embeddingData = embeddingCache[inputIds.count] else {
                throw TTSError.processingFailed("Missing voice embedding for chunk \(index + 1) with \(inputIds.count) tokens")
            }
            let referenceVector = embeddingData.vector
            group.addTask(priority: .userInitiated) {
                let (samples, t) = try await synthesizeChunk(
                    chunk,
                    inputIds: inputIds,
                    variant: template.variant,
                    targetTokens: template.targetTokens,
                    referenceVector: referenceVector
                )
                return ChunkSynthesisResult(index: chunkIndex, samples: samples, predictionTime: t)
            }
        }
        var results: [ChunkSynthesisResult] = []
        results.reserveCapacity(totalChunks)
        for try await r in group { results.append(r) }
        return results
    }

    // Stitch chunks in order: insert silence when the previous chunk carries an
    // explicit pause, otherwise hide the seam with a short linear crossfade.
    let sorted = chunkOutputs.sorted { $0.index < $1.index }
    for output in sorted {
        let idx = output.index
        let samples = output.samples
        chunkSampleBuffers[idx] = samples
        if idx == 0 {
            allSamples.append(contentsOf: samples)
            continue
        }
        let prevPause = entries[idx - 1].chunk.pauseAfterMs
        if prevPause > 0 {
            let silenceCount = Int(Double(prevPause) * samplesPerMillisecond)
            if silenceCount > 0 {
                allSamples.append(contentsOf: repeatElement(0.0, count: silenceCount))
            }
            allSamples.append(contentsOf: samples)
        } else {
            let n = min(crossfadeN, allSamples.count, samples.count)
            if n > 0 {
                // Linear crossfade over the n-sample overlap:
                //   out[j] = tail[j] * (1 - w) + head[j] * w, with w ramping 0 -> 1.
                // Done as a scalar loop (n <= ~crossfadeN samples) instead of
                // vDSP_vmul/vDSP_vma: the previous vDSP form derived an n-element
                // output pointer from a single-element inout subscript
                // (&allSamples[i]), which is undefined behavior in Swift's
                // inout-to-pointer conversion, and it copied two temp arrays per seam.
                let tailStartIndex = allSamples.count - n
                for j in 0..<n {
                    let fadeIn: Float = n == 1 ? 1 : Float(j) / Float(n - 1)
                    allSamples[tailStartIndex + j] =
                        allSamples[tailStartIndex + j] * (1 - fadeIn) + samples[j] * fadeIn
                }
                if samples.count > n {
                    allSamples.append(contentsOf: samples[n...])
                }
            } else {
                allSamples.append(contentsOf: samples)
            }
        }
    }

    guard !allSamples.isEmpty else {
        throw TTSError.processingFailed("Synthesis produced no samples")
    }

    // Peak-normalize the stitched audio, and scale the per-chunk buffers by the
    // same factor so ChunkInfo samples stay consistent with the combined output.
    var maxMag: Float = 0
    vDSP_maxmgv(allSamples, 1, &maxMag, vDSP_Length(allSamples.count))
    if maxMag > 0 {
        let inv = 1.0 / maxMag
        for k in allSamples.indices { allSamples[k] *= inv }
        for idx in chunkSampleBuffers.indices {
            for k in chunkSampleBuffers[idx].indices { chunkSampleBuffers[idx][k] *= inv }
        }
    }

    // Mandarin zh guard: if total audio is shorter than 5.0 s, trim a few frames
    // of trailing padding from the end (mirrored into the last chunk buffer).
    let fiveSecSamples = Int(5.0 * Double(TtsConstants.audioSampleRate))
    let trimSamples = TtsConstants.shortVariantGuardFrameCount * TtsConstants.kokoroFrameSamples
    if allSamples.count > trimSamples, allSamples.count < fiveSecSamples {
        allSamples.removeLast(trimSamples)
        if let lastIdx = chunkSampleBuffers.indices.last,
           chunkSampleBuffers[lastIdx].count > trimSamples {
            chunkSampleBuffers[lastIdx].removeLast(trimSamples)
        }
    }

    let audioData = try AudioWAV.data(from: allSamples, sampleRate: Double(TtsConstants.audioSampleRate))
    let chunkInfos = zip(chunkTemplates, chunkSampleBuffers).map { t, s in
        ChunkInfo(index: t.index, text: t.text, wordCount: t.wordCount, words: t.words, atoms: t.atoms, pauseAfterMs: t.pauseAfterMs, tokenCount: t.tokenCount, samples: s, variant: t.variant)
    }
    var footprints: [ModelNames.TTS.Variant: Int] = [:]
    for v in Set(entries.map { $0.template.variant }) {
        if let url = try? modelBundleURL(for: v) {
            footprints[v] = directorySize(at: url)
        }
    }
    let diagnostics = Diagnostics(
        variantFootprints: footprints,
        lexiconEntryCount: lexiconMetrics.entryCount,
        lexiconEstimatedBytes: lexiconMetrics.estimatedBytes,
        audioSampleBytes: allSamples.count * MemoryLayout<Float>.size,
        outputWavBytes: audioData.count
    )
    let base = SynthesisResult(audio: audioData, chunks: chunkInfos, diagnostics: diagnostics)

    // Time-stretch only when the requested speed differs meaningfully from 1.0.
    let factor = max(0.1, voiceSpeed)
    if abs(factor - 1.0) < 0.01 { return base }
    let adjustedChunks = base.chunks.map { c -> ChunkInfo in
        let stretched = adjustSamples(c.samples, factor: factor)
        return ChunkInfo(index: c.index, text: c.text, wordCount: c.wordCount, words: c.words, atoms: c.atoms, pauseAfterMs: c.pauseAfterMs, tokenCount: c.tokenCount, samples: stretched, variant: c.variant)
    }
    let combined = adjustedChunks.flatMap { $0.samples }
    let adjustedAudio = try AudioWAV.data(from: combined, sampleRate: Double(TtsConstants.audioSampleRate))
    let updatedDiag = base.diagnostics?.updating(audioSampleBytes: combined.count * MemoryLayout<Float>.size, outputWavBytes: adjustedAudio.count)
    return SynthesisResult(audio: adjustedAudio, chunks: adjustedChunks, diagnostics: updatedDiag)
}
950+
760951 private static func adjustSamples( _ samples: [ Float ] , factor: Float ) -> [ Float ] {
761952 let clamped = max ( 0.1 , factor)
762953 if abs ( clamped - 1.0 ) < 0.01 { return samples }
0 commit comments