From 5c81990e53c7a5ba2de0d0cc18bc5d012b7c71d0 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 21 Nov 2025 19:34:05 +1300 Subject: [PATCH 1/3] Make nibble2base() et al's source parameter a pointer to const --- sam_internal.h | 6 +++--- simd.c | 8 ++++---- test/test_nibbles.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sam_internal.h b/sam_internal.h index 161782aa3..027e5d574 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -69,7 +69,7 @@ static inline int possibly_expand_bam_data(bam1_t *b, size_t bytes) { * for (i = 0; i < len; i++) * seq[i] = seq_nt16_str[bam_seqi(nib, i)]; */ -static inline void nibble2base_default(uint8_t *nib, char *seq, int len) { +static inline void nibble2base_default(const uint8_t *nib, char *seq, int len) { static const char code2base[512] = "===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N" "A=AAACAMAGARASAVATAWAYAHAKADABAN" @@ -108,9 +108,9 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) { #define BUILDING_SIMD_NIBBLE2BASE #endif -static inline void nibble2base(uint8_t *nib, char *seq, int len) { +static inline void nibble2base(const uint8_t *nib, char *seq, int len) { #ifdef BUILDING_SIMD_NIBBLE2BASE - extern void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len); + extern void (*htslib_nibble2base)(const uint8_t *nib, char *seq, int len); htslib_nibble2base(nib, seq, len); #else nibble2base_default(nib, seq, len); diff --git a/simd.c b/simd.c index 62a2e7766..c7484ed10 100644 --- a/simd.c +++ b/simd.c @@ -116,7 +116,7 @@ static inline int cpu_supports_neon(void) { #ifdef BUILDING_SIMD_NIBBLE2BASE -void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default; +void (*htslib_nibble2base)(const uint8_t *nib, char *seq, int len) = nibble2base_default; #if defined __x86_64__ @@ -130,10 +130,10 @@ void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_defau */ __attribute__((target("ssse3"))) -static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) { +static void nibble2base_ssse3(const uint8_t *nib, char *seq, int len) { const char *seq_end_ptr = seq + len; char *seq_cursor = seq; - uint8_t *nibble_cursor = nib; + const uint8_t *nibble_cursor = nib; const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i) - 1); __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str); /* Nucleotides are encoded 4-bits per nucleotide and stored in 8-bit bytes @@ -170,7 +170,7 @@ static void nibble2base_resolve(void) { #elif defined __ARM_NEON -static void nibble2base_neon(uint8_t *nib, char *seq0, int len) { +static void nibble2base_neon(const uint8_t *nib, char *seq0, int len) { uint8x16_t low_nibbles_mask = vdupq_n_u8(0x0f); uint8x16_t nuc_lookup_vec = vld1q_u8((const uint8_t *) seq_nt16_str); #ifndef __aarch64__ diff --git a/test/test_nibbles.c b/test/test_nibbles.c index 1ef3456ea..5ca4aa58c 100644 --- a/test/test_nibbles.c +++ b/test/test_nibbles.c @@ -66,7 +66,7 @@ char *fmttime(long long elapsed) { return buf; } -void nibble2base_single(uint8_t *nib, char *seq, int len) { +void nibble2base_single(const uint8_t *nib, char *seq, int len) { int i; for (i = 0; i < len; i++) seq[i] = seq_nt16_str[bam_seqi(nib, i)]; From 2d2d5aa690ed9b575e7ae4cc4a2b98680a6014c7 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 11 Dec 2025 21:00:19 +1300 Subject: [PATCH 2/3] Reduce local variable shadowing and fix second COPY_MINUS_N function The optimised COPY_MINUS_N() implementation declares its own locals. Make the simple COPY_MINUS_N() declare its own local i rather than shadowing a local of that name from its caller. --- sam.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sam.c b/sam.c index 0a386fe1d..10565da00 100644 --- a/sam.c +++ b/sam.c @@ -2683,6 +2683,7 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) // Basic version which operates a byte at a time #define COPY_MINUS_N(to,from,n,l,failed) do { \ uint8_t uflow = 0; \ + int i; \ for (i = 0; i < (l); ++i) { \ (to)[i] = (from)[i] - (n); \ uflow |= (uint8_t) (to)[i]; \ @@ -2699,7 +2700,7 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) uint8_t *t; char *p = s->s, *q; - int i, overflow = 0; + int overflow = 0; char logbuf[40]; hts_pos_t cigreflen; bam1_core_t *c = &b->core; @@ -2796,8 +2797,8 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) c->l_qseq = p - q - 1; hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname)); _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length"); - i = (c->l_qseq + 1) >> 1; - _get_mem(uint8_t, &t, b, i); + int half_l_qseq = (c->l_qseq + 1) >> 1; + _get_mem(uint8_t, &t, b, half_l_qseq); unsigned int lqs2 = c->l_qseq&~1, i; for (i = 0; i < lqs2; i+=2) @@ -4034,7 +4035,6 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { // The SAMTags spec recommends (but not requires) separating // barcodes with hyphen ('-'). - size_t i; for (i = 0; i < UMI_len; i++) UMI_seq[i] = isalpha_c(x->name.s[i+match[1].rm_so]) ? x->name.s[i+match[1].rm_so] @@ -4161,7 +4161,6 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) { // We don't support multi-threaded SAM parsing with seeks yet. - int ret; if ((ret = sam_state_destroy(fp)) < 0) { errno = -ret; return -2; @@ -4347,7 +4346,7 @@ static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *st r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len if (c->l_qseq) { // seq and qual - uint8_t *s = bam_get_seq(b); + s = bam_get_seq(b); if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err; char *cp = str->s + str->l; From 9de8e4f885f56bd0419d51fb7affc65726718ff4 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 21 Nov 2025 19:59:29 +1300 Subject: [PATCH 3/3] Add API functions to parse and format SEQ and QUAL fields --- htslib/sam.h | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++ sam.c | 45 ++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/htslib/sam.h b/htslib/sam.h index 53d0059a9..116cd21cd 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1143,6 +1143,75 @@ ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t * HTSLIB_EXPORT ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b); +/// Unpack nibble-encoded sequence into a char buffer +/** @param dest Destination character buffer + @param seq Pointer to sequence bases packed as per bam1_t + @param seqlen Number of bases at @p seq to be unpacked + +Unpacks nucleotides encoded in the (seqlen+1)/2 bytes at @p seq. +Writes exactly @p seqlen characters to @p dest. If NUL-termination +is desired, the calling code should add '\0' appropriately itself. + */ +HTSLIB_EXPORT +void sam_format_seq(char *dest, const uint8_t *seq, size_t seqlen); + +/// Unpack BAM record's sequence field into a char buffer +/** @param dest Destination character buffer + @param b Source alignment record + */ +static inline void bam_format_seq(char *dest, const bam1_t *b) { + sam_format_seq(dest, bam_get_seq(b), b->core.l_qseq); +} + +/// Decode base qualities into a char buffer +/** @param dest Destination character buffer + @param qual Pointer to base qualities encoded as per bam1_t + @param quallen Number of base qualities at @p seq to be decoded + +Decodes base qualities encoded in the @p quallen bytes at @p seq. +Writes exactly @p quallen characters to @p dest. If the base qualities +are missing (encoded as '\xff'), writes '*' followed by quallen-1 NULs. +Otherwise if NUL-termination is desired, the calling code should add +'\0' appropriately itself. + */ +HTSLIB_EXPORT +void sam_format_qual(char *dest, const uint8_t *qual, size_t quallen); + +/// Decode BAM record's base qualities field into a char buffer +/** @param dest Destination character buffer + @param b Source alignment record + */ +static inline void bam_format_qual(char *dest, const bam1_t *b) { + sam_format_qual(dest, bam_get_qual(b), b->core.l_qseq); +} + +/// Pack ASCII sequence string into nibble-encoded sequence buffer +/** @param seq Destination sequence base buffer to be packed as per bam1_t + @param src Pointer to ASCII-encoded base characters + @param len Number of base characters at @p src to be packed + @return The number of bytes written to @p seq, or negative on error + +Packs @p len ASCII base characters into a nibble-encoded sequence buffer. +If the text at @p src is "*\0", writes 0 bytes to @p seq; otherwise writes +exactly floor((len+1)/2) bytes to @p seq. + */ +HTSLIB_EXPORT +int sam_parse_seq(uint8_t *seq, const char *src, size_t len); + +/// Encode ASCII base qualities into uint8_t buffer +/** @param qual Destination base quality buffer to be encoded as per bam1_t + @param src Pointer to ASCII-encoded base quality characters + @param len Number of base qualities at @p src to be encoded + @return The number of bytes written to @p qual, or negative on error + +Encodes @p len ASCII base qualities into a quality buffer encoded as per bam1_t. +If the text at @p src is "*\0" (i.e., if len==1 and src is '*', or len>1 and src +is '*' followed by a NUL), encodes as @p len missing ('\xff') qualities. Always +writes exactly @p len bytes to @p qual. + */ +HTSLIB_EXPORT +int sam_parse_qual(uint8_t *qual, const char *src, size_t len); + /************************* *** BAM/CRAM indexing *** *************************/ diff --git a/sam.c b/sam.c index 10565da00..0a19121a7 100644 --- a/sam.c +++ b/sam.c @@ -4400,6 +4400,51 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) return sam_format1_append(h, b, str); } +void sam_format_seq(char *dest, const uint8_t *seq, size_t seqlen) +{ + nibble2base(seq, dest, seqlen); +} + +void sam_format_qual(char *dest, const uint8_t *qual, size_t quallen) +{ + if (qual[0] != 0xff) { + add33((uint8_t *) dest, qual, quallen); + } + else { + memset(dest, '\0', quallen); + dest[0] = '*'; + } +} + +int sam_parse_seq(uint8_t *seq, const char *src, size_t len) +{ + if (src[0] == '*' && (len == 1 || src[1] == '\0')) return 0; + + const uint8_t *usrc = (const uint8_t *) src; + uint8_t *seq0 = seq; + size_t i, even_len = len & ~1; + for (i = 0; i < even_len; i += 2) + *seq++ = (seq_nt16_table[usrc[i]] << 4) | seq_nt16_table[usrc[i + 1]]; + for (; i < len; i++) + *seq++ = seq_nt16_table[usrc[i]] << 4; + + return seq - seq0; +} + +int sam_parse_qual(uint8_t *qual, const char *src, size_t len) +{ + if (src[0] == '*' && (len == 1 || src[1] == '\0')) { + memset(qual, 0xff, len); + } + else { + int failed = 0; + COPY_MINUS_N(qual, src, 33, len, failed); + if (failed) { errno = EINVAL; return -1; } + } + + return len; +} + static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end); int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) {