Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GroupReadsByUmi may fail when marking duplicates including secondary/supplementary reads #964

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala
Original file line number Diff line number Diff line change
Expand Up @@ -208,21 +208,31 @@ case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] {
def trailingSoftClippedBases: Int = stats.trailingSoftClippedBases

/** Returns the number of bases that are hard-clipped at the start of the sequence. */
def leadingHardClippedBases = this.headOption.map { elem =>
def leadingHardClippedBases: Int = this.headOption.map { elem =>
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

public defs should have return types here and below

if (elem.operator == CigarOperator.H) elem.length else 0
}.getOrElse(0)

/** Returns the number of bases that are clipped at the start of the sequence. */
def leadingClippedBases: Int = leadingHardClippedBases + leadingSoftClippedBases

/** Returns the number of bases that are hard-clipped at the end of the sequence. */
def trailingHardClippedBases = this.lastOption.map { elem =>
def trailingHardClippedBases: Int = this.lastOption.map { elem =>
if (elem.operator == CigarOperator.H) elem.length else 0
}.getOrElse(0)

/** Returns the number of bases that are clipped at the end of the sequence. */
def trailingClippedBases: Int = trailingHardClippedBases + trailingSoftClippedBases

/** Returns the alignment start position adjusted for leading soft and hard clipped bases. */
def unclippedStart(start: Int): Int = {
start - this.iterator.takeWhile(_.operator.isClipping).map(_.length).sum
}

/** Returns the alignment end position adjusted for trailing soft and hard clipped bases. */
def unclippedEnd(end: Int): Int = {
end + this.reverseIterator.takeWhile(_.operator.isClipping).map(_.length).sum
}

/** Truncates the cigar based on either query or target length cutoff. */
private def truncate(len: Int, shouldCount: CigarElem => Boolean): Cigar = {
var pos = 1
Expand Down
80 changes: 73 additions & 7 deletions src/main/scala/com/fulcrumgenomics/bam/Bams.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
package com.fulcrumgenomics.bam

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.alignment.Cigar
import com.fulcrumgenomics.bam.api.SamOrder.Queryname
import com.fulcrumgenomics.bam.api._
import com.fulcrumgenomics.commons.collection.{BetterBufferedIterator, SelfClosingIterator}
Expand All @@ -41,6 +42,56 @@
import java.io.Closeable
import scala.math.{max, min}


/** Class to store information about an alignment, as described in the SAM SA tag. */
private[bam] case class AlignmentInfo(refIndex: Int, start: Int, positiveStrand: Boolean, cigar: Option[Cigar], mapq: Int, nm: Int) {
def negativeStrand: Boolean = !positiveStrand
def refName(header: SAMFileHeader): String = header.getSequence(refIndex).getSequenceName
def mapped: Boolean = cigar.isDefined
def unmapped: Boolean = !mapped
private def _cigar: Cigar = cigar.getOrElse {
throw new IllegalStateException(s"Cannot get cigar for AlignmentInfo: ${this}")

Check warning on line 53 in src/main/scala/com/fulcrumgenomics/bam/Bams.scala

View check run for this annotation

Codecov / codecov/patch

src/main/scala/com/fulcrumgenomics/bam/Bams.scala#L53

Added line #L53 was not covered by tests
}
def end: Int = start + _cigar.lengthOnTarget - 1
def unclippedStart: Int = _cigar.unclippedStart(start)
def unclippedEnd: Int = _cigar.unclippedEnd(end)
/** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
def toSA(header: SAMFileHeader): String = {
val strand = if (positiveStrand) '+' else '-'
val cigar = this.cigar.getOrElse("*")
f"${refName(header)},${start},${strand},${cigar},${mapq},${nm}"
}
}

private[bam] object AlignmentInfo {
def apply(rec: SamRecord, mate: Boolean = false): AlignmentInfo = {
if (mate) {
val mateRefIndex = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
val mateCigar = if (rec.unpaired || rec.mateUnmapped) None else Some(rec.mateCigar.getOrElse {
throw new IllegalStateException(s"Mate CIGAR (Tag 'MC') not found for $rec, consider using SetMateInformation.")

Check warning on line 71 in src/main/scala/com/fulcrumgenomics/bam/Bams.scala

View check run for this annotation

Codecov / codecov/patch

src/main/scala/com/fulcrumgenomics/bam/Bams.scala#L71

Added line #L71 was not covered by tests
})
// NB: mateCigar has already checked for the existence of the MC tag, so using .get here is fine
val mateStart = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateStart
val mateStrand = if (rec.unpaired || rec.mateUnmapped) true else rec.matePositiveStrand
AlignmentInfo(mateRefIndex, mateStart, mateStrand, mateCigar, rec.mapq, 0)
} else {
val refIndex = if (rec.unmapped) Int.MaxValue else rec.refIndex
val positiveStrand = rec.positiveStrand
val start = if (rec.unmapped) Int.MaxValue else rec.start
AlignmentInfo(refIndex, start, positiveStrand, Some(rec.cigar), rec.mapq, rec.getOrElse(SAMTag.NM.name(), 0))
}
}

def apply(sa: String, header: SAMFileHeader): AlignmentInfo = {
val parts = sa.split(",")
require(parts.length == 6, f"Could not parse SA tag: ${sa}")
val refIndex = header.getSequenceIndex(parts(0))
AlignmentInfo(refIndex, parts(1).toInt, parts(2) == "+", Some(Cigar(parts(3))), parts(4).toInt, parts(5).toInt)
}
/** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
def toSA(rec: SamRecord): String = AlignmentInfo(rec).toSA(rec.header)
}

/**
* Class that represents all reads from a template within a BAM file.
*/
Expand Down Expand Up @@ -105,16 +156,27 @@
Template(x1, x2)
}

/** Fixes mate information and sets mate cigar and mate score on all primary and supplementary (but not secondary) records. */

/** Fixes mate information and sets mate cigar and mate score on all primary, secondary, and supplementary records. */
def fixMateInfo(): Unit = {
// Developer note: the mate score ("ms") tag is used by samtools markdup
for (primary <- r1; supp <- r2Supplementals) {
SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
primary.get[Int]("AS").foreach(supp("ms") = _)
// Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to
nh13 marked this conversation as resolved.
Show resolved Hide resolved
// add the "pa" and "pm" tags with information about the primary alignments. Finally, we need the MQ tag!
val r1NonPrimary = r1Supplementals ++ r1Secondaries
val r2NonPrimary = r2Supplementals ++ r2Secondaries
for (primary <- r1; nonPrimary <- r2NonPrimary) {
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
primary.get[Int]("AS").foreach(nonPrimary("ms") = _)
nonPrimary(SAMTag.MQ.name()) = primary.mapq
nonPrimary(Template.MatePrimarySamTag) = AlignmentInfo.toSA(primary)
r2.foreach(r => nonPrimary(Template.ReadPrimarySamTag) = AlignmentInfo.toSA(r))
}
for (primary <- r2; supp <- r1Supplementals) {
SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
primary.get[Int]("AS").foreach(supp("ms") = _)
for (primary <- r2; nonPrimary <- r1NonPrimary) {
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
primary.get[Int]("AS").foreach(nonPrimary("ms") = _)
nonPrimary(SAMTag.MQ.name()) = primary.mapq
nonPrimary(Template.MatePrimarySamTag) = AlignmentInfo.toSA(primary)
r1.foreach(r => nonPrimary(Template.ReadPrimarySamTag) = AlignmentInfo.toSA(r))
}
for (first <- r1; second <- r2) {
SamPairUtil.setMateInfo(first.asSam, second.asSam, true)
Expand All @@ -128,6 +190,10 @@
}

object Template {
/** The local SAM tag to store the alignment information of the primary alignment (in the same format as the SA tag) */
val ReadPrimarySamTag: String = "rp"
/** The local SAM tag to store the alignment information of the mate's primary alignment (in the same format as the SA tag) */
val MatePrimarySamTag: String = "mp"
/**
* Generates a Template for the next template in the buffered iterator. Assumes that the
* iterator is queryname sorted or grouped.
Expand Down
39 changes: 27 additions & 12 deletions src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

package com.fulcrumgenomics.bam.api

import com.fulcrumgenomics.bam.{Bams, AlignmentInfo, Template}
import com.fulcrumgenomics.umi.ConsensusTags
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools.util.Murmur3
Expand Down Expand Up @@ -175,24 +176,38 @@
override val groupOrder: GroupOrder = GroupOrder.query
override val subSort: Option[String] = Some("template-coordinate")
override val sortkey: SamRecord => A = rec => {
val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
val readNeg = rec.negativeStrand
val mateNeg = if (rec.paired) rec.mateNegativeStrand else false
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
// For non-secondary/non-supplementary alignments, or unpaired or mate unmapped reads, use the info in the record.
// Otherwise, use the info in the pa/pm tags.
val primary = if (!rec.secondary && !rec.supplementary) AlignmentInfo(rec) else {
val readPrimaryTag = rec.get[String](Template.ReadPrimarySamTag).getOrElse {
throw new IllegalStateException(

Check warning on line 183 in src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala

View check run for this annotation

Codecov / codecov/patch

src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala#L183

Added line #L183 was not covered by tests
s"Missing '${Template.ReadPrimarySamTag}' tag required for TemplateCoordinate; try using ZipperBams or SetMateInformation"
)
}
AlignmentInfo(readPrimaryTag, rec.header)
}
val mate = if ((!rec.secondary && !rec.supplementary) || rec.unpaired || rec.mateUnmapped) AlignmentInfo(rec, mate=true) else {
val matePrimaryTag = rec.get[String](Template.MatePrimarySamTag).getOrElse {
throw new IllegalStateException(

Check warning on line 191 in src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala

View check run for this annotation

Codecov / codecov/patch

src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala#L191

Added line #L191 was not covered by tests
s"Missing '${Template.MatePrimarySamTag}' tag required for TemplateCoordinate; try using ZipperBams or SetMateInformation"
)
}
AlignmentInfo(matePrimaryTag, rec.header)
}
val readPos = if (primary.unmapped) Int.MaxValue else if (primary.negativeStrand) primary.unclippedEnd else primary.unclippedStart
val matePos = if (mate.unmapped) Int.MaxValue else if (mate.negativeStrand) mate.unclippedEnd else mate.unclippedStart
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
val index: Int = m.lastIndexOf('/')
if (index >= 0) m.substring(0, index) else m
}.getOrElse("")

if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
if (primary.refIndex < mate.refIndex || (primary.refIndex == mate.refIndex && readPos < matePos) ||
(primary.refIndex == mate.refIndex && readPos == matePos && primary.positiveStrand)) {
TemplateCoordinateKey(primary.refIndex, mate.refIndex, readPos, matePos, primary.negativeStrand, mate.negativeStrand, lib, mid, rec.name, false)
}
else {
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
TemplateCoordinateKey(mate.refIndex, primary.refIndex, matePos, readPos, mate.negativeStrand, primary.negativeStrand, lib, mid, rec.name, true)
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class TransientAttrs(private val rec: SamRecord) {

/**
* A trait that fgbio uses as a replacement for [[SAMRecord]]. The trait is self-typed as a
* [[SamRecordIntermediate]] which is a sub-class of SAMRecord. It is done this wasy so that
* [[SamRecordIntermediate]] which is a sub-class of SAMRecord. It is done this ways so that
* a) we can access superclass methods via [[SamRecordIntermediate]] but that self-typing here
* instead of extending hides the [[SAMRecord]] API from users of the class. The result is
* always a [[SAMRecord]] but isn't seen as such without casting.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ class GroupReadsByUmi

// Then output the records in the right order (assigned tag, read name, r1, r2)
templatesByMi.keys.toSeq.sortBy(id => (id.length, id)).foreach(tag => {
templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.primaryReads).foreach(rec => {
templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.allReads).foreach(rec => {
nh13 marked this conversation as resolved.
Show resolved Hide resolved
out += rec
})
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,39 @@ class AlignmentTest extends UnitSpec {
Cigar("10M2D20M").reverse.toString shouldBe "20M2D10M"
}

private case class ClippingTestCase
(cigar: String, clippedBases: Int,
leadingSoftClippedBases: Int, trailingSoftClippedBases: Int,
leadingHardClippedBases: Int, trailingHardClippedBases: Int,
leadingClippedBases: Int, trailingClippedBases: Int,
unclippedStart: Int, unclippedEnd: Int)

Seq(
ClippingTestCase("50M", 0, 0, 0, 0, 0, 0, 0, 0, 0), // no clipping
ClippingTestCase("20S50M", 20, 20, 0, 0, 0, 20, 0, -20, 0), // leading soft clipping
ClippingTestCase("50M10S", 10, 0, 10, 0, 0, 0, 10, 0, 10), // trailing soft clipping
ClippingTestCase("20S50M10S", 30, 20, 10, 0, 0, 20, 10, -20, 10), // leading and trailing soft clipping
ClippingTestCase("20H50M", 20, 0, 0, 20, 0, 20, 0, -20, 0), // leading hard clipping
ClippingTestCase("50M10H", 10, 0, 0, 0, 10, 0, 10, 0, 10), // trailing hard clipping
ClippingTestCase("20H50M10H", 30, 0, 0, 20, 10, 20, 10, -20, 10), // leading and trailing hard clipping
ClippingTestCase("15H5S50M", 20, 5, 0, 15, 0, 20, 0, -20, 0), // leading hard-the-soft clipping
ClippingTestCase("50M5S15H", 20, 0, 5, 0, 15, 0, 20, 0, 20), // trailing hard-the-soft clipping
ClippingTestCase("20H15S50M10S5H", 50, 15, 10, 20, 5, 35, 15, -35, 15), // leading and trailing hard-the-soft clipping
).foreach { testCase =>
"Cigar clipping methods" should s"account for leading and trailing clipping in ${testCase.cigar}" in {
val cigar = Cigar(testCase.cigar)
cigar.clippedBases shouldBe testCase.clippedBases
cigar.leadingSoftClippedBases shouldBe testCase.leadingSoftClippedBases
cigar.trailingSoftClippedBases shouldBe testCase.trailingSoftClippedBases
cigar.leadingHardClippedBases shouldBe testCase.leadingHardClippedBases
cigar.trailingHardClippedBases shouldBe testCase.trailingHardClippedBases
cigar.leadingClippedBases shouldBe testCase.leadingClippedBases
cigar.trailingClippedBases shouldBe testCase.trailingClippedBases
cigar.unclippedStart(100) shouldBe testCase.unclippedStart + 100
cigar.unclippedEnd(200) shouldBe testCase.unclippedEnd + 200
}
}

/////////////////////////////////////////////////////////////////////////////
// Tests for the Alignment class
/////////////////////////////////////////////////////////////////////////////
Expand Down
30 changes: 29 additions & 1 deletion src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@
package com.fulcrumgenomics.bam.api

import java.util.Random

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.Bams
import com.fulcrumgenomics.bam.Bams.{MaxInMemory, templateIterator}
import com.fulcrumgenomics.commons.util.SimpleCounter
import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
import com.fulcrumgenomics.util.Io
import htsjdk.samtools.{SAMFileHeader, SAMRecordCoordinateComparator, SAMRecordQueryNameComparator}
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools.util.Murmur3
Expand Down Expand Up @@ -217,4 +219,30 @@ class SamOrderTest extends UnitSpec {

actual should contain theSameElementsInOrderAs expected
}

it should "handle supplementary alignments" in {
val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Queryname))
val exp = ListBuffer[SamRecord]()
// primary read pairs for q1, that map to different contigs
exp ++= builder.addPair("q1", contig=1, contig2=Some(2), start1=66, start2=47, cigar1="60M40S", cigar2="55M45S", strand2=Plus)
// supplementary R2 (ignore R1), which maps to the same chromosome as the primary R1
val Seq(_, s2) = builder.addPair("q1", contig=1, contig2=Some(1), start1=66, start2=66, cigar1="60M40S", strand2=Minus)
s2.supplementary = true
s2.properlyPaired = true
exp += s2
// primary read pairs for q2, that map to different contigs, but earlier that q1
exp ++= builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S")

// Fix the mate information. Note: sorting here to get a template-iterator will write the records to disk first,
// so we cannot use the records in builder/exp.
val records = Bams.templateIterator(iterator=exp.iterator, header=builder.header, maxInMemory=MaxInMemory, tmpDir=Io.tmpDir).flatMap { template =>
template.fixMateInfo()
template.allReads
}.toList

val expected = List("q2/1", "q2/2", "q1/1", "q1/2", "q1/2:sup")
val actual = records.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id)

actual should contain theSameElementsInOrderAs expected
}
}
Loading
Loading